def setup_ignite(
        engine: Engine,
        params: SimpleNamespace,
        exp_source,
        run_name: str,
        extra_metrics: Iterable[str] = (),
):
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(
        exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print(
            "Episode %d: reward=%.2f, steps=%s, "
            "speed=%.1f f/s, elapsed=%s" % (
                trainer.state.episode,
                trainer.state.episode_reward,
                trainer.state.episode_steps,
                trainer.state.metrics.get("avg_fps", 0),
                timedelta(seconds=int(passed)),
            )
        )

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics["time_passed"]
        print(
            "Game solved in %s, after %d episodes "
            "and %d iterations!" % (
                timedelta(seconds=int(passed)),
                trainer.state.episode,
                trainer.state.iteration,
            )
        )
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
def setup_ignite(engine: Engine, params: SimpleNamespace, exp_source,
                 run_name: str, extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)
    # fires end-of-episode events; subscribes itself to ITERATION_COMPLETED
    ptan_ignite.EndOfEpisodeHandler(
        exp_source, bound_avg_reward=params.stop_reward).attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" % (
                  trainer.state.episode, trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get('avg_fps', 0),
                  timedelta(seconds=int(passed))))

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics['time_passed']
        print("Game solved in %s, after %d episodes "
              "and %d iterations!" % (
                  timedelta(seconds=int(passed)),
                  trainer.state.episode, trainer.state.iteration))
        trainer.should_terminate = True

    RunningAverage(output_transform=lambda v: v['loss']).attach(
        engine, "avg_loss")

    now = datetime.now().isoformat(timespec='minutes')
    now = now.replace(":", "")  # colons are invalid in Windows paths
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    handler = tb_logger.OutputHandler(
        tag="episodes", metric_names=['reward', 'steps', 'avg_reward'])
    tb.attach(engine, log_handler=handler,
              event_name=ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a)
    tb.attach(engine, log_handler=handler,
              event_name=ptan_ignite.PeriodEvents.ITERS_100_COMPLETED)
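# A minimal wiring sketch for the setup_ignite variants above. Hedged:
# `calc_loss` and `batch_generator` are hypothetical placeholders, not
# part of the original snippets; only the Engine wiring mirrors the
# code above. The key contract is that the process function returns a
# dict with a "loss" entry, which the RunningAverage transform expects.
def make_engine(net, optimizer, params, exp_source):
    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = calc_loss(batch, net, gamma=params.gamma)  # hypothetical loss fn
        loss_v.backward()
        optimizer.step()
        return {"loss": loss_v.item()}

    engine = Engine(process_batch)
    setup_ignite(engine, params, exp_source, run_name="baseline")
    return engine

# engine.run(batch_generator(buffer, params.batch_size)) would then drive
# training; batch_generator is assumed to yield sampled replay batches.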
def setup_ignite(engine: Engine, exp_source, run_name: str,
                 extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(exp_source)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        avg_steps = trainer.state.metrics.get("avg_steps", 50)
        avg_reward = trainer.state.metrics.get("avg_reward", 0.0)
        print("Episode %d: reward=%.0f (avg %.2f), "
              "steps=%s (avg %.2f), speed=%.1f f/s, "
              "elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  avg_reward,
                  trainer.state.episode_steps,
                  avg_steps,
                  trainer.state.metrics.get("avg_fps", 0),
                  timedelta(seconds=int(passed)),
              ))
        if avg_steps < 15 and trainer.state.episode > 100:
            print("Average steps has fallen below 15, stop training")
            trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward", "avg_steps"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
def setup_ignite(engine: Engine, exp_source, run_name: str,
                 extra_metrics: Iterable[str] = ()):
    warnings.simplefilter('ignore', category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(
        exp_source, subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print('Episode %d: reward=%.0f, steps=%s, '
              'speed=%.1f f/s, elapsed=%s' % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get('avg_fps', 0),
                  timedelta(seconds=int(passed))))

    now = datetime.now().isoformat(timespec='minutes')
    logdir = f'runs-{now}-{run_name}'.replace(':', '')
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, 'avg_loss')

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag='episodes', metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 1000 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag='train', metric_names=metrics,
        output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
def setup_ignite(engine: Engine, exp_source, run_name: str,
                 extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(
        exp_source, subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get("avg_fps", 0),
                  timedelta(seconds=int(passed)),
              ))

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
def setup_ignite(engine: Engine, params: SimpleNamespace, exp_source,
                 run_name: str, net, extra_metrics: Iterable[str] = ()):
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(
        exp_source, subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  timedelta(seconds=int(passed))))
        # save a snapshot of the network after every episode
        path = './saves/episode-%03d.data' % trainer.state.episode
        torch.save(net.state_dict(), path)

    now = datetime.now().isoformat(timespec='minutes').replace(':', '')
    logdir = f"runs2/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, 'avg_loss')

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag='episodes', metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 1000 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
        print("Episode %d: reward=%s, steps=%s, speed=%.3f frames/s, elapsed=%s" % (
            trainer.state.episode, trainer.state.episode_reward,
            trainer.state.episode_steps,
            trainer.state.metrics.get('fps', 0),
            timedelta(seconds=trainer.state.metrics.get('time_passed', 0))))

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        print("Game solved in %s, after %d episodes and %d iterations!" % (
            timedelta(seconds=trainer.state.metrics['time_passed']),
            trainer.state.episode, trainer.state.iteration))
        trainer.should_terminate = True

    logdir = f"runs/{datetime.now().isoformat(timespec='minutes')}-{params.run_name}-{NAME}={args.envs}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    RunningAverage(output_transform=lambda v: v['loss']).attach(
        engine, "avg_loss")

    episode_handler = tb_logger.OutputHandler(
        tag="episodes", metric_names=['reward', 'steps', 'avg_reward'])
    tb.attach(engine, log_handler=episode_handler,
              event_name=ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=['avg_loss', 'avg_fps'],
        output_transform=lambda a: a)
    tb.attach(engine, log_handler=handler,
              event_name=ptan_ignite.PeriodEvents.ITERS_100_COMPLETED)
        gen_loss.backward()
        gen_optimizer.step()

        if trainer.state.iteration % SAVE_IMAGE_EVERY_ITER == 0:
            trainer.tb.writer.add_image(
                'fake', vutils.make_grid(gen_output_v.data[:64], normalize=True),
                trainer.state.iteration)
            trainer.tb.writer.add_image(
                'real', vutils.make_grid(batch_v.data[:64], normalize=True),
                trainer.state.iteration)
        return dis_loss.item(), gen_loss.item()

    engine = Engine(process_batch)
    tb = tb_logger.TensorboardLogger(log_dir=None)
    engine.tb = tb
    RunningAverage(output_transform=lambda out: out[0]).attach(
        engine, 'avg_loss_dis')
    RunningAverage(output_transform=lambda out: out[1]).attach(
        engine, 'avg_loss_gen')

    handler = tb_logger.OutputHandler(
        tag='train', metric_names=['avg_loss_dis', 'avg_loss_gen'])
    tb.attach(engine, log_handler=handler,
              event_name=Events.ITERATION_COMPLETED)

    @engine.on(Events.ITERATION_COMPLETED)
    def log_losses(trainer):
        if trainer.state.iteration % REPORT_EVERY_ITER == 0:
            log.info('Iter %d: gen_loss=%.3f, dis_loss=%.3f',
                     trainer.state.iteration,
                     trainer.state.metrics['avg_loss_gen'],
                     trainer.state.metrics['avg_loss_dis'])
def setup_ignite(
        engine: Engine,
        params: SimpleNamespace,
        exp_source,
        run_name: str,
        model,
        optimizer,
        extra_metrics: Iterable[str] = (),
):
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(
        exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    # periodically checkpoint model, optimizer and trainer state ...
    objects_to_checkpoint = {
        'model': model,
        'optimizer': optimizer,
        'trainer': engine,
    }
    checkpoint_dir = Path("models")
    saver = DiskSaver(str(checkpoint_dir), create_dir=True,
                      require_empty=False)
    handler = Checkpoint(objects_to_checkpoint, saver, n_saved=2)
    engine.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

    # ... and resume from the newest checkpoint, if one exists
    # (iterdir() order is arbitrary, so sort by modification time)
    checkpoints_paths = sorted(checkpoint_dir.iterdir(),
                               key=lambda p: p.stat().st_mtime)
    if checkpoints_paths:
        print(f"Loading checkpoint {checkpoints_paths[-1].name}")
        checkpoint = torch.load(checkpoints_paths[-1])
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.2f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get('avg_fps', 0),
                  timedelta(seconds=int(passed))))

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics['time_passed']
        print("Game solved in %s, after %d episodes "
              "and %d iterations!" % (
                  timedelta(seconds=int(passed)),
                  trainer.state.episode, trainer.state.iteration))
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec='minutes').replace(":", "-")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, "avg_loss")

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda computation")
    args = parser.parse_args()

    device = torch.device("cuda" if args.cuda else "cpu")
    envs = [
        InputWrapper(gym.make(name))
        for name in ("Breakout-v0", "AirRaid-v0", "Pong-v0")
    ]
    input_shape = envs[0].observation_space.shape

    net_discr = Discriminator(input_shape=input_shape).to(device)
    net_gener = Generator(output_shape=input_shape).to(device)

    objective = nn.BCELoss()
    gen_optimizer = optim.Adam(params=net_gener.parameters(),
                               lr=LEARNING_RATE, betas=(0.5, 0.999))
    dis_optimizer = optim.Adam(params=net_discr.parameters(),
                               lr=LEARNING_RATE, betas=(0.5, 0.999))

    true_labels_v = torch.ones(BATCH_SIZE, device=device)
    fake_labels_v = torch.zeros(BATCH_SIZE, device=device)

    def process_batch(trainer, batch):
        gen_input_v = torch.FloatTensor(BATCH_SIZE, LATENT_VECTOR_SIZE, 1, 1)
        gen_input_v.normal_(0, 1)
        gen_input_v = gen_input_v.to(device)
        batch_v = batch.to(device)
        gen_output_v = net_gener(gen_input_v)

        # train discriminator
        dis_optimizer.zero_grad()
        dis_output_true_v = net_discr(batch_v)
        dis_output_fake_v = net_discr(gen_output_v.detach())
        dis_loss = objective(dis_output_true_v, true_labels_v) + \
            objective(dis_output_fake_v, fake_labels_v)
        dis_loss.backward()
        dis_optimizer.step()

        # train generator
        gen_optimizer.zero_grad()
        dis_output_v = net_discr(gen_output_v)
        gen_loss = objective(dis_output_v, true_labels_v)
        gen_loss.backward()
        gen_optimizer.step()

        if trainer.state.iteration % SAVE_IMAGE_EVERY_ITER == 0:
            fake_img = vutils.make_grid(gen_output_v.data[:64], normalize=True)
            trainer.tb.writer.add_image("fake", fake_img,
                                        trainer.state.iteration)
            real_img = vutils.make_grid(batch_v.data[:64], normalize=True)
            trainer.tb.writer.add_image("real", real_img,
                                        trainer.state.iteration)
            trainer.tb.writer.flush()
        return dis_loss.item(), gen_loss.item()

    engine = Engine(process_batch)
    tb = tb_logger.TensorboardLogger(log_dir=None)
    engine.tb = tb
    RunningAverage(output_transform=lambda out: out[1]).attach(
        engine, "avg_loss_gen")
    RunningAverage(output_transform=lambda out: out[0]).attach(
        engine, "avg_loss_dis")

    handler = tb_logger.OutputHandler(
        tag="train", metric_names=["avg_loss_gen", "avg_loss_dis"])
    tb.attach(engine, log_handler=handler,
              event_name=Events.ITERATION_COMPLETED)

    @engine.on(Events.ITERATION_COMPLETED)
    def log_losses(trainer):
        if trainer.state.iteration % REPORT_EVERY_ITER == 0:
            log.info(
                "%d: gen_loss=%f, dis_loss=%f",
                trainer.state.iteration,
                trainer.state.metrics["avg_loss_gen"],
                trainer.state.metrics["avg_loss_dis"],
            )

    engine.run(data=iterate_batches(envs))
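# The GAN loop above consumes an endless generator of screenshot
# batches. A minimal sketch of what `iterate_batches` could look like
# (an assumption inferred from how it is used above, not the original
# implementation): step the environments with random actions, keep
# non-trivial frames, and yield normalized float tensors.
import random
import numpy as np
import torch


def iterate_batches(envs, batch_size=16):
    batch = [e.reset() for e in envs]
    env_gen = iter(lambda: random.choice(envs), None)  # infinite env picker
    while True:
        e = next(env_gen)
        obs, reward, is_done, _ = e.step(e.action_space.sample())
        if np.mean(obs) > 0.01:  # skip near-black flicker frames
            batch.append(obs)
        if len(batch) == batch_size:
            batch_np = np.array(batch, dtype=np.float32)
            batch_np = batch_np * 2.0 / 255.0 - 1.0  # scale to [-1, 1]
            yield torch.tensor(batch_np)
            batch.clear()
        if is_done:
            e.reset()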
def train(args, hyper_params):
    print(args)
    print(hyper_params)

    args.channels.sort(
        key=lambda x: src.dataset.Traffic4CastSample.channel_to_index[x])

    model = MODELS[args.model_type](**filter_dict(hyper_params, "model"))
    slice_size = model.past + model.future
    assert model.future == 3

    if args.model is not None:
        model_path = args.model
        model_name = os.path.basename(args.model)
        model.load(model_path)
    else:
        model_name = f"{args.model_type}_" + "_".join(args.channels + args.cities)
        model_path = f"output/models/{model_name}.pth"

    if model.num_channels != len(args.channels):
        print(f"ERROR: Model to channels mismatch. Model can predict "
              f"{model.num_channels} channels. {len(args.channels)} were "
              "selected.")
        sys.exit(1)

    transforms = [
        lambda x: x.float(),
        lambda x: x / 255,
        src.dataset.Traffic4CastSample.Transforms.Permute("TCHW"),
        src.dataset.Traffic4CastSample.Transforms.SelectChannels(args.channels),
    ]
    train_dataset = src.dataset.Traffic4CastDataset(
        ROOT, "training", args.cities, transforms)
    valid_dataset = src.dataset.Traffic4CastDataset(
        ROOT, "validation", args.cities, transforms)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=1,
        collate_fn=src.dataset.Traffic4CastDataset.collate_list, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=1,
        collate_fn=src.dataset.Traffic4CastDataset.collate_list, shuffle=False)

    ignite_train = ignite_selected(
        train_loader, slice_size=slice_size,
        **filter_dict(hyper_params, "ignite_selected"))

    optimizer = torch.optim.Adam(
        model.parameters(), **filter_dict(hyper_params, "optimizer"))
    loss = nn.MSELoss()
    best_loss = 1.0

    device = args.device
    if device.find('cuda') != -1 and not torch.cuda.is_available():
        device = 'cpu'

    trainer = engine.create_supervised_trainer(
        model, optimizer, loss, device=device,
        prepare_batch=model.ignite_batch)
    evaluator = engine.create_supervised_evaluator(
        model, metrics={'loss': ignite.metrics.Loss(loss)}, device=device,
        prepare_batch=model.ignite_batch)

    @trainer.on(engine.Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        print("Epoch {:3d} Train loss: {:8.6f}".format(
            trainer.state.epoch, trainer.state.output))

    @trainer.on(engine.Events.EPOCH_COMPLETED)
    def log_validation_loss(trainer):
        evaluator.run(ignite_selected(valid_loader, slice_size=slice_size))
        metrics = evaluator.state.metrics
        print("Epoch {:3d} Valid loss: {:8.6f} ←".format(
            trainer.state.epoch, metrics['loss']))
        # the training generator is exhausted after each epoch, so rebuild it
        trainer.state.dataloader = ignite_selected(
            train_loader, slice_size=slice_size,
            **filter_dict(hyper_params, "ignite_selected"))
        nonlocal best_loss
        best_loss = min(best_loss, metrics['loss'])

    if "learning-rate-scheduler" in args.callbacks:
        lr_reduce = lr_scheduler.ReduceLROnPlateau(
            optimizer, verbose=args.verbose, **LR_REDUCE_PARAMS)

        @evaluator.on(engine.Events.COMPLETED)
        def update_lr_reduce(engine):
            loss = engine.state.metrics['loss']
            lr_reduce.step(loss)

    def score_function(engine):
        return -engine.state.metrics['loss']

    if "early-stopping" in args.callbacks:
        early_stopping_handler = ignite.handlers.EarlyStopping(
            patience=PATIENCE, score_function=score_function, trainer=trainer)
        evaluator.add_event_handler(
            engine.Events.EPOCH_COMPLETED, early_stopping_handler)

    if "model-checkpoint" in args.callbacks:
        checkpoint_handler = ignite.handlers.ModelCheckpoint(
            "output/models/checkpoints", model_name,
            score_function=score_function, n_saved=1,
            require_empty=False, create_dir=True)
        evaluator.add_event_handler(
            engine.Events.EPOCH_COMPLETED, checkpoint_handler,
            {"model": model})

    if "tensorboard" in args.callbacks:
        logger = tensorboard_logger.TensorboardLogger(
            log_dir=f"output/tensorboard/{model_name}")
        logger.attach(
            trainer,
            log_handler=tensorboard_logger.OutputHandler(
                tag="training",
                output_transform=lambda loss: {'loss': loss}),
            event_name=engine.Events.ITERATION_COMPLETED)
        logger.attach(
            evaluator,
            log_handler=tensorboard_logger.OutputHandler(
                tag="validation",
                metric_names=["loss"],
                another_engine=trainer),
            event_name=engine.Events.EPOCH_COMPLETED)

    trainer.run(ignite_train, **filter_dict(hyper_params, "trainer_run"))

    if "save-model" in args.callbacks and not "model-checkpoint" in args.callbacks:
        torch.save(model.state_dict(), model_path)
        print("Model saved at:", model_path)
    elif "save-model" in args.callbacks:
        # Move best model from checkpoint directory to output/models
        checkpoints_dir = "output/models/checkpoints"
        source, *_ = [
            f for f in reversed(utils.sorted_ls(checkpoints_dir))
            if f.startswith(model_name)
        ]  # get most recent model
        os.rename(os.path.join(checkpoints_dir, source), model_path)
        print("Model saved at:", model_path)

    return {
        'loss': best_loss,  # HpBandSter always minimizes!
        'info': {
            'args': vars(args),
            'hyper-params': hyper_params,
        },
    }
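# `filter_dict` above routes one flat hyper-parameter dict to several
# consumers (model, optimizer, trainer.run, ...). A plausible sketch of
# it, inferred from the call sites rather than taken from the project
# (the real helper and its separator convention may differ): keep the
# keys carrying a given prefix and strip that prefix.
def filter_dict(d, prefix, sep="_"):
    return {
        k[len(prefix) + len(sep):]: v
        for k, v in d.items()
        if k.startswith(prefix + sep)
    }

# e.g. filter_dict({"optimizer_lr": 1e-3, "model_depth": 4}, "optimizer")
# -> {"lr": 1e-3}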
def fit(self,
        train_loader: _data.DataLoader,
        val_loader: _data.DataLoader,
        epochs: int = 1,
        batches: int = None,
        learning_rate: float = 1e-3) -> None:
    if batches is None:
        batches = VocalExtractor.get_number_of_batches(train_loader)

    loss_fn = nn.BCELoss()
    optimizer = _optim.Adam(self.model.parameters(), lr=learning_rate)
    trainer = _engine.create_supervised_trainer(
        self.model, optimizer, loss_fn, device=self.device)
    _metrics.RunningAverage(
        output_transform=lambda x: x,
        device=self.device).attach(trainer, 'loss')

    progressbar = _chandlers.ProgressBar(
        bar_format="{desc}[{n_fmt}/{total_fmt}] {percentage:3.0f}%|{bar:20}| "
                   "[{elapsed}<{remaining}]{postfix}",
        persist=True,
        ascii=" #")
    progressbar.attach(trainer, ['loss'])

    def get_metrics_fn() -> Dict[str, _metrics.Metric]:
        def rounded_transform(output):
            y_pred, y = output
            return torch.round(y_pred), y

        transform = rounded_transform
        accuracy = _metrics.Accuracy(transform, device=self.device)
        precision = _metrics.Precision(transform, device=self.device)
        recall = _metrics.Recall(transform, device=self.device)
        f1 = precision * recall * 2 / (precision + recall + 1e-20)
        return {
            'loss': _metrics.Loss(loss_fn),
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    evaluator = _engine.create_supervised_evaluator(
        self.model, metrics=get_metrics_fn(), device=self.device)

    score_fn_name = "f1"

    def score_function(engine: _engine.Engine):
        return engine.state.metrics[score_fn_name]

    best_model_saver = _handlers.ModelCheckpoint(
        dirname="best_models",
        filename_prefix="vocal_extractor",
        score_name=score_fn_name,
        score_function=score_function,
        n_saved=5,
        create_dir=True)
    evaluator.add_event_handler(
        _engine.Events.COMPLETED, best_model_saver, {"model": self.model})

    each_model_saver = _handlers.ModelCheckpoint(
        dirname="all_models",
        filename_prefix="vocal_extractor",
        score_name=score_fn_name,
        score_function=score_function,
        n_saved=None,
        create_dir=True)
    evaluator.add_event_handler(
        _engine.Events.COMPLETED, each_model_saver, {"model": self.model})

    @trainer.on(_engine.Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine: _engine.Engine) -> None:
        metrics = VocalExtractor.compute_metrics(val_loader, evaluator)
        string = ", ".join(f"val_{k}: {v:.4f}" for k, v in metrics.items())
        progressbar.log_message(string + "\n")

    with _tb_logger.TensorboardLogger(log_dir="tb_logs") as tb_logger:
        global_step = _tb_logger.global_step_from_engine(trainer)

        train_running_loss_log_handler = _tb_logger.OutputHandler(
            tag="training",
            output_transform=lambda x: {'running_loss': x})
        tb_logger.attach(trainer,
                         log_handler=train_running_loss_log_handler,
                         event_name=_engine.Events.ITERATION_COMPLETED)

        val_metrics_log_handler = _tb_logger.OutputHandler(
            tag="validation",
            metric_names=[name for name, _ in get_metrics_fn().items()],
            global_step_transform=global_step)
        tb_logger.attach(evaluator,
                         log_handler=val_metrics_log_handler,
                         event_name=_engine.Events.EPOCH_COMPLETED)

        tb_logger.attach(
            trainer,
            log_handler=_tb_logger.OptimizerParamsHandler(optimizer),
            event_name=_engine.Events.ITERATION_STARTED)
        tb_logger.attach(
            trainer,
            log_handler=_tb_logger.WeightsScalarHandler(self.model),
            event_name=_engine.Events.ITERATION_COMPLETED)
        tb_logger.attach(
            trainer,
            log_handler=_tb_logger.WeightsHistHandler(self.model),
            event_name=_engine.Events.EPOCH_COMPLETED)
        tb_logger.attach(
            trainer,
            log_handler=_tb_logger.GradsScalarHandler(self.model),
            event_name=_engine.Events.ITERATION_COMPLETED)
        tb_logger.attach(
            trainer,
            log_handler=_tb_logger.GradsHistHandler(self.model),
            event_name=_engine.Events.EPOCH_COMPLETED)

        torchsummary.summary(self.model,
                             input_size=(1, self.freq_bins, self.time_bins),
                             batch_size=train_loader.batch_size,
                             device=self.device)
        trainer.run(data=train_loader, epoch_length=batches,
                    max_epochs=epochs)
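# `VocalExtractor.compute_metrics` is referenced above but not shown.
# A minimal sketch of what it plausibly does, inferred from the call
# site (the real method may differ): run the evaluator over the given
# loader and hand back its metrics dict.

# (as a method of VocalExtractor)
@staticmethod
def compute_metrics(loader: _data.DataLoader,
                    evaluator: _engine.Engine) -> Dict[str, float]:
    evaluator.run(loader)
    return dict(evaluator.state.metrics)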
def _init_tb_logger(self) -> t.Optional[tbl.TensorboardLogger]:
    return tbl.TensorboardLogger(
        log_dir=(ct.WORK_ROOT / self.opts.run_dir / 'logs').as_posix(),
    )
def train():
    learning_rate = 0.0001
    save_on_iter_count = 100
    device = "cuda"

    envs = [
        ObservationScaler(gym.make(name))
        for name in ("Breakout-v0", "Pong-v0", "AirRaid-v0")
    ]

    discriminator = Discriminator(img_size=64).to(device)
    generator = Generator().to(device)

    objective = nn.BCELoss()
    discr_optimizer = optim.Adam(params=discriminator.parameters(),
                                 lr=learning_rate, betas=(0.5, 0.999))
    gen_optimizer = optim.Adam(params=generator.parameters(),
                               lr=learning_rate, betas=(0.5, 0.999))

    def process_batch(trainer, batch):
        batch_size = batch.shape[0]
        gen_input_size = 10

        # get labels and inputs
        generator_inputs = torch.randn(
            (batch_size, gen_input_size, 1, 1)).to(device)
        fake_inputs = generator(generator_inputs).to(device)
        true_inputs = batch.to(device)
        fake_image_labels = torch.zeros((batch_size, )).to(device)
        true_image_labels = torch.ones((batch_size, )).to(device)

        # train discriminator
        discr_optimizer.zero_grad()
        discr_fake_image_output = discriminator(fake_inputs.detach())
        discr_true_image_output = discriminator(true_inputs)
        discr_loss = objective(discr_fake_image_output, fake_image_labels) + \
            objective(discr_true_image_output, true_image_labels)
        discr_loss.backward()
        discr_optimizer.step()

        # train generator
        gen_optimizer.zero_grad()
        discr_output = discriminator(fake_inputs)
        gen_loss = objective(discr_output, true_image_labels)
        gen_loss.backward()
        gen_optimizer.step()

        # save images
        if trainer.state.iteration % save_on_iter_count == 0:
            fake_img = vutils.make_grid(fake_inputs.data[:64], normalize=True)
            trainer.tb.writer.add_image("fake", fake_img,
                                        trainer.state.iteration)
            real_img = vutils.make_grid(true_inputs.data[:64], normalize=True)
            trainer.tb.writer.add_image("real", real_img,
                                        trainer.state.iteration)
            trainer.tb.writer.flush()
        return discr_loss.item(), gen_loss.item()

    engine = Engine(process_batch)
    tb = tb_logger.TensorboardLogger(log_dir=None)
    engine.tb = tb
    RunningAverage(output_transform=lambda out: out[1]).attach(
        engine, "avg_loss_gen")
    RunningAverage(output_transform=lambda out: out[0]).attach(
        engine, "avg_loss_dis")

    handler = tb_logger.OutputHandler(
        tag="train", metric_names=["avg_loss_gen", "avg_loss_dis"])
    tb.attach(engine, log_handler=handler,
              event_name=Events.ITERATION_COMPLETED)

    @engine.on(Events.ITERATION_COMPLETED(every=100))
    def log_training_loss(engine):
        print(f"Iteration[{engine.state.iteration}] Loss:",
              engine.state.output)

    engine.run(data=generate_batch(envs))
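# `ObservationScaler` (like `InputWrapper` in the earlier snippet) is
# referenced but not defined here. A plausible sketch, under the
# assumption that it resizes Atari frames to the 64x64 CHW float format
# the 64-pixel discriminator expects; the real wrapper may differ.
import cv2
import gym
import gym.spaces
import numpy as np


class ObservationScaler(gym.ObservationWrapper):
    IMG_SIZE = 64

    def __init__(self, env):
        super().__init__(env)
        old = env.observation_space
        self.observation_space = gym.spaces.Box(
            low=0.0, high=255.0,
            shape=(old.shape[-1], self.IMG_SIZE, self.IMG_SIZE),
            dtype=np.float32)

    def observation(self, obs):
        # resize the HWC uint8 frame, then move channels first
        new_obs = cv2.resize(obs, (self.IMG_SIZE, self.IMG_SIZE))
        return np.moveaxis(new_obs, 2, 0).astype(np.float32)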