def test_concat_scheduler_asserts():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
    scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

    with pytest.raises(ValueError):
        ConcatScheduler(schedulers=[], durations=[])

    with pytest.raises(ValueError):
        ConcatScheduler(schedulers=[scheduler_1], durations=[10])

    with pytest.raises(TypeError):
        ConcatScheduler(schedulers=[scheduler_1, 12], durations=[10])

    with pytest.raises(ValueError):
        ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=[10, 5])

    with pytest.raises(ValueError):
        ConcatScheduler(schedulers=[scheduler_1, scheduler_2, scheduler_2], durations=[15, 12.0])

    with pytest.raises(ValueError):
        ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations="abc")

    with pytest.raises(ValueError):
        ConcatScheduler.simulate_values(
            num_events=123, schedulers=[scheduler_1, scheduler_2], durations=[15], param_names="abc"
        )
def test_concat_scheduler_state_dict():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)
    scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
    scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)
    durations = [10]
    concat_scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations, save_history=False)
    state_dict = concat_scheduler.state_dict()

    assert state_dict["durations"] == durations
    assert state_dict["_current_duration"] == durations[0]
    assert state_dict["_scheduler_index"] == 0

    for _ in range(20):
        concat_scheduler(None, None)

    concat_scheduler.load_state_dict(state_dict)
    assert concat_scheduler.durations == durations
    assert concat_scheduler._current_duration == durations[0]
    assert id(concat_scheduler._current_scheduler) == id(scheduler_1)

    with pytest.raises(ValueError, match=r"Required state attribute 'schedulers' is absent in provided state_dict"):
        concat_scheduler.load_state_dict({"a": 1})

    with pytest.raises(ValueError, match=r"Input state_dict contains 0 state_dicts of concatenated schedulers"):
        concat_scheduler.load_state_dict({"schedulers": []})

    with pytest.raises(TypeError, match=r"Argument state_dict should be a dictionary, but given"):
        concat_scheduler.load_state_dict(None)
def test_concat_scheduler_asserts():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
    scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

    with pytest.raises(TypeError, match=r"Argument schedulers should be a sequence"):
        ConcatScheduler(schedulers=None, durations=[])

    with pytest.raises(ValueError, match=r"Argument schedulers should be of more than one parameter schedulers"):
        ConcatScheduler(schedulers=[], durations=[])

    with pytest.raises(ValueError, match=r"Argument schedulers should be of more than one parameter schedulers"):
        ConcatScheduler(schedulers=[scheduler_1], durations=[10])

    with pytest.raises(TypeError, match=r"Value at index 1 of schedulers should be a parameter scheduler"):
        ConcatScheduler(schedulers=[scheduler_1, 12], durations=[10])

    with pytest.raises(ValueError, match=r"Incorrect number schedulers or duration values"):
        ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=[10, 5])

    with pytest.raises(ValueError, match=r"Argument durations should be list/tuple of integers"):
        ConcatScheduler(schedulers=[scheduler_1, scheduler_2, scheduler_2], durations=[15, 12.0])

    with pytest.raises(TypeError, match=r"Argument durations should be list/tuple"):
        ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations="abc")

    with pytest.raises(TypeError, match=r"Argument param_names should be list or tuple"):
        ConcatScheduler.simulate_values(
            num_events=123, schedulers=[scheduler_1, scheduler_2], durations=[15], param_names="abc"
        )

    with pytest.raises(ValueError, match=r"Argument param_names should be list or tuple of strings"):
        ConcatScheduler.simulate_values(
            num_events=123, schedulers=[scheduler_1, scheduler_2], durations=[15], param_names=[1]
        )

    optimizer_2 = torch.optim.SGD([tensor], lr=0)
    scheduler_3 = CosineAnnealingScheduler(optimizer_2, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

    with pytest.raises(ValueError, match=r"schedulers should be related to same optimizer"):
        ConcatScheduler([scheduler_1, scheduler_3], durations=[30])
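For contrast with the failure cases exercised above, a minimal sketch of a valid construction (illustrative, not part of the original tests): all schedulers must drive the same optimizer, and durations must be a list of integers with one entry fewer than schedulers.

# Hedged sketch: a ConcatScheduler setup that satisfies the constraints checked above.
tensor = torch.zeros([1], requires_grad=True)
optimizer = torch.optim.SGD([tensor], lr=0)

scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

# Both schedulers share one optimizer; a single integer duration means
# scheduler_1 is applied for the first 10 events, then scheduler_2 takes over.
combined = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=[10])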
def test_cosine_annealing_scheduler():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10)
    state_dict = scheduler.state_dict()

    data = [0] * 9
    max_epochs = 2
    simulated_values = CosineAnnealingScheduler.simulate_values(
        num_events=len(data) * max_epochs, param_name="lr", start_value=0, end_value=1, cycle_size=10,
    )

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]["lr"])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    for _ in range(2):
        lrs = []
        trainer.run(data, max_epochs=max_epochs)

        assert lrs == list(
            map(
                pytest.approx,
                [
                    0.0,
                    0.02447174185242318,
                    0.09549150281252627,
                    0.20610737385376332,
                    0.3454915028125263,
                    0.5,
                    0.6545084971874737,
                    0.7938926261462365,
                    0.9045084971874737,
                    0.9755282581475768,
                    0.0,
                    0.02447174185242318,
                    0.09549150281252627,
                    0.20610737385376332,
                    0.3454915028125263,
                    0.5,
                    0.6545084971874737,
                    0.7938926261462365,
                    # 0.9045084971874737, 0.9755282581475768
                ],
            )
        )
        scheduler.load_state_dict(state_dict)

    assert lrs == pytest.approx([v for i, v in simulated_values])
def test_scheduler_with_param_groups():
    def _test(lr_scheduler, optimizer):
        num_iterations = 10
        max_epochs = 20

        state_dict = lr_scheduler.state_dict()

        trainer = Engine(lambda engine, batch: None)

        @trainer.on(Events.ITERATION_COMPLETED)
        def save_lr():
            lrs.append((optimizer.param_groups[0]["lr"], optimizer.param_groups[1]["lr"]))

        trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)

        data = [0] * num_iterations

        for _ in range(2):
            lrs = []
            trainer.run(data, max_epochs=max_epochs)
            assert [lr[0] for lr in lrs] == pytest.approx([lr[1] for lr in lrs])
            lr_scheduler.load_state_dict(state_dict)

    t1 = torch.zeros([1], requires_grad=True)
    t2 = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([{"params": t1, "lr": 0.1}, {"params": t2, "lr": 0.1}])

    lr_scheduler = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
    _test(lr_scheduler, optimizer)

    lr_scheduler = PiecewiseLinear(
        optimizer, "lr", milestones_values=[(5, 0.5), (15, 1.0), (25, 0.0), (35, 1.0), (40, 0.5)]
    )
    _test(lr_scheduler, optimizer)

    lr_scheduler = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)
    _test(lr_scheduler, optimizer)

    torch_lr_scheduler = ExponentialLR(optimizer, gamma=0.98)
    _test(LRScheduler(torch_lr_scheduler), optimizer)

    torch_lr_scheduler = StepLR(optimizer, step_size=50, gamma=0.5)
    _test(LRScheduler(torch_lr_scheduler), optimizer)
def _test(duration_vals_as_np_int):
    scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
    scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

    durations = [10]
    if duration_vals_as_np_int:
        durations = [np.int64(t) for t in durations]

    concat_scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations, save_history=True)

    data = [0] * 10
    max_epochs = 2
    simulated_values = ConcatScheduler.simulate_values(
        num_events=len(data) * max_epochs, schedulers=[scheduler_1, scheduler_2], durations=durations
    )

    lrs = []

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]['lr'])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
    trainer.run(data, max_epochs=max_epochs)

    assert lrs == list(map(pytest.approx, [
        # Cycle 1 of the LinearCyclicalScheduler
        1.0, 0.8, 0.6, 0.4, 0.2,
        0.0, 0.2, 0.4, 0.6, 0.8,
        # Cycle 1 of the CosineAnnealingScheduler
        0.0, 0.02447174185242318, 0.09549150281252627, 0.20610737385376332, 0.3454915028125263,
        0.5, 0.6545084971874737, 0.7938926261462365, 0.9045084971874737, 0.9755282581475768,
    ]))

    state_lrs = trainer.state.param_history['lr']
    assert len(state_lrs) == len(lrs)
    # Unpack singleton lists
    assert [group[0] for group in state_lrs] == lrs

    assert lrs == pytest.approx([v for i, v in simulated_values])
def test_cosine_annealing_scheduler():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler = CosineAnnealingScheduler(optimizer, 'lr', 0, 1, 10)
    lrs = []

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]['lr'])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
    trainer.run([0] * 10, max_epochs=2)

    assert lrs == list(map(pytest.approx, [
        0.0, 0.02447174185242318, 0.09549150281252627, 0.20610737385376332, 0.3454915028125263,
        0.5, 0.6545084971874737, 0.7938926261462365, 0.9045084971874737, 0.9755282581475768,
        0.0, 0.02447174185242318, 0.09549150281252627, 0.20610737385376332, 0.3454915028125263,
        0.5, 0.6545084971874737, 0.7938926261462365, 0.9045084971874737, 0.9755282581475768,
    ]))
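The schedule asserted above can also be previewed without running an Engine; a minimal sketch using simulate_values with the same parameters (start_value=0, end_value=1, cycle_size=10):

# Sketch: preview 20 events of the cosine schedule used in the test above.
# simulate_values returns (event_index, value) pairs.
values = CosineAnnealingScheduler.simulate_values(
    num_events=20, param_name="lr", start_value=0, end_value=1, cycle_size=10
)
lrs_preview = [v for i, v in values]  # should match the expected list in the test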
print(model)
model.to(device)

# multi-gpus
if torch.cuda.device_count():
    print('==================== Use {} GPUs ===================='.format(torch.cuda.device_count()))
    model = nn.DataParallel(model)

# loss function
loss_fn = nn.CrossEntropyLoss()

# optimizer
optimizer = optim.SGD(model.parameters(), lr=init_lr, momentum=0.9, weight_decay=5e-4)

# scheduler
scheduler = CosineAnnealingScheduler(optimizer, 'lr', init_lr, end_lr, 4 * len(trainloader),
                                     cycle_mult=1.5, start_value_mult=0.1)
scheduler = create_lr_scheduler_with_warmup(scheduler,
                                            warmup_start_value=0.,
                                            warmup_end_value=init_lr,
                                            warmup_duration=len(trainloader))

# create trainer
trainer = create_trainer(model, optimizer, loss_fn, device=device)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# add timer for each iteration
timer = Timer(average=False)

# logging training loss
def log_loss(engine):
    i = engine.state.iteration
    e = engine.state.epoch
    if i % 100 == 0:
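To sanity-check the combined warmup + cosine schedule before training, create_lr_scheduler_with_warmup can, in the ignite versions this script appears to target (treat this as an assumption), fill an output_simulated_values list with simulated (event, lr) pairs. A hedged sketch reusing init_lr, end_lr, optimizer, and trainloader from the snippet above:

# Hedged sketch (assumes create_lr_scheduler_with_warmup supports the
# output_simulated_values argument): pre-compute the warmup + cosine LR curve.
num_events = 5 * len(trainloader)
simulated_lrs = [None] * num_events
_ = create_lr_scheduler_with_warmup(
    CosineAnnealingScheduler(optimizer, 'lr', init_lr, end_lr, 4 * len(trainloader),
                             cycle_mult=1.5, start_value_mult=0.1),
    warmup_start_value=0.,
    warmup_end_value=init_lr,
    warmup_duration=len(trainloader),
    output_simulated_values=simulated_lrs,
)
# simulated_lrs now holds (event_index, lr) pairs that can be plotted for inspection.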
checkpointer = ModelCheckpoint(
    CHECKPOINTS_RUN_DIR_PATH,
    filename_prefix=RUN_NAME.lower(),
    n_saved=None,
    score_function=lambda engine: round(engine.state.metrics['WRA'], 3),
    score_name='WRA',
    atomic=True,
    require_empty=True,
    create_dir=True,
    archived=False,
    global_step_transform=global_step_from_engine(trainer))

nan_handler = TerminateOnNan()
coslr = CosineAnnealingScheduler(opt, "lr", start_value=LR, end_value=LR / 4, cycle_size=TOTAL_UPDATE_STEPS // 1)

evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'_': mude})
trainer.add_event_handler(Events.ITERATION_COMPLETED, nan_handler)
trainer.add_event_handler(Events.ITERATION_COMPLETED, coslr)

GpuInfo().attach(trainer, name='gpu')
pbar.attach(trainer,
            output_transform=lambda output: {'loss': output['loss']},
            metric_names=[f"gpu:{args.gpu} mem(%)"])

# FIRE
def test_simulate_values():

    def _test(scheduler_cls, **scheduler_kwargs):
        optimizer = None
        if scheduler_cls == LRScheduler:
            scheduler_kwargs['optimizer'] = scheduler_kwargs['lr_scheduler'].optimizer
            optimizer = scheduler_kwargs['optimizer']
        elif scheduler_cls == ConcatScheduler:
            optimizer = scheduler_kwargs['optimizer']
            del scheduler_kwargs['optimizer']
        else:
            tensor = torch.zeros([1], requires_grad=True)
            scheduler_kwargs['optimizer'] = torch.optim.SGD([tensor], lr=0.1)
            optimizer = scheduler_kwargs['optimizer']

        max_epochs = 2
        data = [0] * 10

        simulated_values = scheduler_cls.simulate_values(num_events=len(data) * max_epochs, **scheduler_kwargs)

        scheduler = scheduler_cls(**scheduler_kwargs)

        lrs = []

        def save_lr(engine):
            lrs.append(optimizer.param_groups[0]['lr'])

        trainer = Engine(lambda engine, batch: None)
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
        trainer.run(data, max_epochs=max_epochs)

        assert lrs == pytest.approx([v for i, v in simulated_values])

        if scheduler_cls == LRScheduler or scheduler_cls == ConcatScheduler:
            # As internal state of torch lr scheduler has been changed the following checks will fail
            return

        # reexecute to check if no internal changes
        simulated_values = scheduler_cls.simulate_values(num_events=len(data) * max_epochs,
                                                         save_history=True,  # this will be removed
                                                         **scheduler_kwargs)
        assert lrs == pytest.approx([v for i, v in simulated_values])

    # LinearCyclicalScheduler
    _test(LinearCyclicalScheduler, param_name="lr", start_value=1.0, end_value=0.0, cycle_size=10)

    # CosineAnnealingScheduler
    _test(CosineAnnealingScheduler, param_name="lr", start_value=1.0, end_value=0.0, cycle_size=10)

    # LRScheduler
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0.1)
    torch_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.5)
    _test(LRScheduler, lr_scheduler=torch_lr_scheduler)

    # ConcatScheduler = [LinearCyclicalScheduler, CosineAnnealingScheduler]
    scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=20)
    scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)
    durations = [10]
    _test(ConcatScheduler, optimizer=optimizer, schedulers=[scheduler_1, scheduler_2], durations=durations)

    # ConcatScheduler = [LinearCyclicalScheduler, LRScheduler]
    tensor = torch.ones([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0.001)
    torch_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=1.5)
    scheduler_1 = LRScheduler(torch_lr_scheduler)
    scheduler_2 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.1, end_value=0.0, cycle_size=10)
    durations = [10]
    _test(ConcatScheduler, optimizer=optimizer, schedulers=[scheduler_1, scheduler_2], durations=durations)
def train(cfg):
    print(cfg.pretty())

    ###################################################################
    # Dataset
    ###################################################################
    wt = Dataset(batch_size=cfg.train.batch_size,
                 bptt_len=cfg.train.bptt_len,
                 dataset_cls=hydra.utils.get_class(cfg.dataset.name))

    ###################################################################
    # Models
    ###################################################################
    base_embedding = hydra.utils.instantiate(cfg.embedding, ntokens=len(wt.text_field.vocab) + 3)
    embedding = TransformerEmbedding(embedding=base_embedding,
                                     max_length=cfg.train.bptt_len,
                                     embedding_size=base_embedding.embedding_size,
                                     use_positional_embedding=False)

    encoder = TransformerEncoder(query_dim=cfg.encoder.query_dim,
                                 att_num_units=cfg.encoder.att_num_units,
                                 ffn_num_unit=cfg.encoder.ffn_num_unit,
                                 max_ext=cfg.encoder.max_ext)
    model = TransformerLanguageModel(embedding, encoder)
    model.init_weight()
    # wandb.watch(model)

    ###################################################################
    # Loss
    ###################################################################
    criterion = lm_criterion(in_features=cfg.encoder.att_num_units[-1],
                             vocab_size=len(wt.text_field.vocab))

    ###################################################################
    # Parameters + Train ops
    ###################################################################
    parameters = (list(model.parameters()) + list(criterion.parameters()))
    tot_params = 0
    for p in parameters:
        tot_params += reduce(lambda x, y: x * y, p.size())
    print("Total Parameters: ", tot_params)
    opt = optim.Adam(parameters, lr=cfg.train.lr)
    model.to(DEVICE)
    criterion.to(DEVICE)

    ###################################################################
    # Train + Evaluation
    ###################################################################
    def train_step(engine, batch):
        model.train()
        opt.zero_grad()

        text = batch.text.to(DEVICE).t().contiguous()
        target = batch.target.to(DEVICE).t().contiguous()

        out, out_past = model(text, engine.state.train_past)
        engine.state.train_past = out_past
        raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
        loss = raw_loss[1]
        loss.backward()

        nn.utils.clip_grad_norm_(parameters, cfg.train.clip_grad)
        opt.step()

        return {"train_loss": loss.item(), "train_ppl": loss.exp().item()}

    def eval_step(engine, batch):
        model.eval()

        if not hasattr(engine.state, "eval_past"):
            engine.state.eval_past = None

        target_sample = []
        result_sample = []
        with torch.no_grad():
            text = batch.text.to(DEVICE).t().contiguous()
            target = batch.target.to(DEVICE).t().contiguous()

            out, out_past = model(text, engine.state.eval_past)

            vocab = wt.text_field.vocab
            idx = list(range(32))
            sample = random.choices(idx, k=5)
            for id_sample in sample:
                s = []
                for target_id in target[id_sample]:
                    s.append(vocab.itos[target_id])
                target_sample.append(" ".join(s))

                s = []
                for result_id in out.max(-1)[1][id_sample]:
                    s.append(vocab.itos[result_id])
                result_sample.append(" ".join(s))

            # engine.state.eval_past = out_past
            raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
            loss = raw_loss[1]

            return {
                "val_loss": loss.item(),
                "sample": (target_sample, result_sample)
            }

    train_engine = Engine(train_step)
    eval_engine = Engine(eval_step)

    def reset_state(engine):
        engine.state.train_past = None

    def run_eval(_):
        print("start running eval")
        eval_engine.run(wt.valid_iter)
        metrics = eval_engine.state.metrics
        print("Validation loss: ", metrics["val_loss"], ", ppl: ",
              np.exp(metrics["val_loss"]))

    train_engine.add_event_handler(Events.EPOCH_STARTED, reset_state)
    train_engine.add_event_handler(Events.EPOCH_COMPLETED, run_eval)

    ###################################################################
    # LR Scheduler
    ###################################################################
    cosine_scheduler = CosineAnnealingScheduler(opt.param_groups[0],
                                                "lr",
                                                0.0,
                                                2.5e-4,
                                                cycle_size=len(wt.train_iter))
    warmup_scheduler = create_lr_scheduler_with_warmup(cosine_scheduler, 0.0,
                                                       2.5e-4, 200)
    train_engine.add_event_handler(Events.ITERATION_STARTED, warmup_scheduler)

    ###################################################################
    # Metrics
    ###################################################################
    RunningAverage(output_transform=lambda x: x["train_ppl"]).attach(
        train_engine, "train_ppl")
    RunningAverage(output_transform=lambda x: x["train_loss"]).attach(
        train_engine, "train_loss")
    RunningAverage(output_transform=lambda x: x["val_loss"]).attach(
        eval_engine, "val_loss")
    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(train_engine, ["train_ppl", "train_loss"])
    progress_bar_val = ProgressBar(persist=True)
    progress_bar_val.attach(eval_engine, ["val_loss"])

    ###################################################################
    # Tensorboard
    ###################################################################
    # tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger = WandbLogger(project="language_model", entity="akurniawan")
    tb_logger.watch(model)

    def stepn_logger(num_steps, handler):
        def logger_runner(engine, log_handler, event_name):
            if engine.state.iteration % num_steps == 0:
                handler(engine, log_handler, event_name)

        return logger_runner

    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(
                         cfg.train.log_steps,
                         OutputHandler(tag="training",
                                       output_transform=lambda loss: loss)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(eval_engine,
                     log_handler=OutputHandler(
                         tag="validation",
                         output_transform=lambda loss: loss,
                         another_engine=train_engine),
                     event_name=Events.EPOCH_COMPLETED)
    # tb_logger.attach(train_engine,
    #                  log_handler=stepn_logger(log_steps,
    #                                           OptimizerParamsHandler(opt)),
    #                  event_name=Events.ITERATION_STARTED)
    # tb_logger.attach(train_engine,
    #                  log_handler=stepn_logger(log_steps,
    #                                           WeightsScalarHandler(model)),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(train_engine,
    #                  log_handler=stepn_logger(log_steps,
    #                                           GradsScalarHandler(model)),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(train_engine,
    #                  log_handler=stepn_logger(500, WeightsHistHandler(model)),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(train_engine,
    #                  log_handler=stepn_logger(500, GradsHistHandler(model)),
    #                  event_name=Events.ITERATION_COMPLETED)

    try:
        train_engine.run(wt.train_iter, max_epochs=cfg.train.epochs)
    except Exception:
        pass
    finally:
        tb_logger.close()
def train(epochs=500,
          batch_size=32,
          bptt_len=70,
          lr=0.00025,
          log_steps=200,
          clip_grad=0.25,
          log_dir="experiments"):
    ###################################################################
    # Dataset
    ###################################################################
    wt = wikitext103(batch_size=batch_size, bptt_len=bptt_len)
    # wt = wikitext2(batch_size=batch_size, bptt_len=bptt_len)

    ###################################################################
    # Configs
    ###################################################################
    embedding_config = DropEmbedding.Hyperparams(len(wt.text_field.vocab) + 3, ninp=512)
    encoder_config = TransformerEncoder.Hyperparams(
        att_num_units=[512, 512, 512, 512, 512, 512], max_ext=384)

    ###################################################################
    # Models
    ###################################################################
    base_embedding = DropEmbedding(embedding_config)
    embedding = TransformerEmbedding(embedding=base_embedding,
                                     max_length=bptt_len,
                                     embedding_size=embedding_config.ninp,
                                     use_positional_embedding=False)
    encoder = TransformerEncoder(encoder_config)
    model = TransformerLanguageModel(embedding, encoder)
    model.init_weight()

    ###################################################################
    # Loss
    ###################################################################
    criterion = lm_criterion(in_features=encoder_config.att_num_units[-1],
                             vocab_size=len(wt.text_field.vocab))

    ###################################################################
    # Parameters + Train ops
    ###################################################################
    parameters = (list(model.parameters()) + list(criterion.parameters()))
    tot_params = 0
    for p in parameters:
        tot_params += reduce(lambda x, y: x * y, p.size())
    print("Total Parameters: ", tot_params)
    opt = optim.Adam(parameters, lr=lr)
    model.to(DEVICE)
    criterion.to(DEVICE)

    ###################################################################
    # Train + Evaluation
    ###################################################################
    def train_step(engine, batch):
        model.train()
        opt.zero_grad()

        text = batch.text.to(DEVICE).t().contiguous()
        target = batch.target.to(DEVICE).t().contiguous()

        out, out_past = model(text, engine.state.train_past)
        engine.state.train_past = out_past
        raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
        loss = raw_loss[1]
        loss.backward()

        nn.utils.clip_grad_norm_(parameters, clip_grad)
        opt.step()

        return {"train_loss": loss.item(), "train_ppl": loss.exp().item()}

    def eval_step(engine, batch):
        model.eval()

        if not hasattr(engine.state, "eval_past"):
            engine.state.eval_past = None

        with torch.no_grad():
            text = batch.text.to(DEVICE).t().contiguous()
            target = batch.target.to(DEVICE).t().contiguous()

            out, out_past = model(text, engine.state.eval_past)
            engine.state.eval_past = out_past
            raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
            loss = raw_loss[1]

            return {"val_loss": loss.item()}

    train_engine = Engine(train_step)
    eval_engine = Engine(eval_step)

    def reset_state(engine):
        engine.state.train_past = None

    def run_eval(_):
        print("start running eval")
        eval_engine.run(wt.valid_iter)
        metrics = eval_engine.state.metrics
        print("Validation loss: ", metrics["val_loss"], ", ppl: ",
              np.exp(metrics["val_loss"]))

    train_engine.add_event_handler(Events.EPOCH_STARTED, reset_state)
    train_engine.add_event_handler(Events.EPOCH_COMPLETED, run_eval)

    ###################################################################
    # LR Scheduler
    ###################################################################
    cosine_scheduler = CosineAnnealingScheduler(opt.param_groups[0],
                                                "lr",
                                                0.0,
                                                2.5e-4,
                                                cycle_size=len(wt.train_iter))
    warmup_scheduler = create_lr_scheduler_with_warmup(cosine_scheduler, 0.0,
                                                       2.5e-4, 200)
    train_engine.add_event_handler(Events.ITERATION_STARTED, warmup_scheduler)

    ###################################################################
    # Metrics
    ###################################################################
    RunningAverage(output_transform=lambda x: x["train_ppl"]).attach(
        train_engine, "train_ppl")
    RunningAverage(output_transform=lambda x: x["train_loss"]).attach(
        train_engine, "train_loss")
    RunningAverage(output_transform=lambda x: x["val_loss"]).attach(
        eval_engine, "val_loss")
    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(train_engine, ["train_ppl", "train_loss"])
    progress_bar_val = ProgressBar(persist=True)
    progress_bar_val.attach(eval_engine, ["val_loss"])

    ###################################################################
    # Tensorboard
    ###################################################################
    tb_logger = TensorboardLogger(log_dir=log_dir)

    def stepn_logger(num_steps, handler):
        def logger_runner(engine, log_handler, event_name):
            if engine.state.iteration % num_steps == 0:
                handler(engine, log_handler, event_name)

        return logger_runner

    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(
                         log_steps,
                         OutputHandler(tag="training",
                                       output_transform=lambda loss: loss)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(eval_engine,
                     log_handler=OutputHandler(
                         tag="validation",
                         output_transform=lambda loss: loss,
                         another_engine=train_engine),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              OptimizerParamsHandler(opt)),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              WeightsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              GradsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, WeightsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, GradsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)

    try:
        train_engine.run(wt.train_iter, max_epochs=epochs)
    except Exception:
        pass
    finally:
        tb_logger.close()
def run(config):
    if config['model'] == 'AttentionWideResNet':
        train_loader, val_loader = get_data_loaders(config['batch_size'])
        model = AttentionWideResNet(28, 100, 10, (32, 32), 0.0)
    elif config['model'] == 'AttentionRetinaNet':
        train_loader, val_loader = get_COCO_loaders(config['batch_size'])
        model = AttentionRetinaNet(num_classes=80, input_size=(5, 3))

    writer = create_summary_writer(model, train_loader, config["tb_logdir"])
    model.cuda()

    log_interval = config['log_interval']
    epochs = config['epochs']

    model = nn.DataParallel(model)
    optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=config['momentum'])
    scheduler = CosineAnnealingScheduler(optimizer, 'lr', 0.1, 0.001, len(train_loader))
    loss_fn = nn.CrossEntropyLoss().cuda()

    trainer = create_supervised_trainer(model, optimizer, loss_fn, device='cuda')
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    trainer_saver = ModelCheckpoint(
        config['checkpoint_dir'],
        filename_prefix="model_ckpt",
        save_interval=1000,
        n_saved=10,
        atomic=True,
        save_as_state_dict=True,
        create_dir=True
    )
    trainer.add_event_handler(Events.ITERATION_COMPLETED, trainer_saver, {"model": model})

    evaluator = create_supervised_evaluator(model,
                                            metrics={"accuracy": Accuracy(), 'CE': Loss(loss_fn)},
                                            device="cuda")

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)
            writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_CE = metrics['CE']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_CE)
        )
        writer.add_scalar("training/avg_loss", avg_CE, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_CE = metrics['CE']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_CE)
        )
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("validation/avg_loss", avg_CE, engine.state.epoch)
        writer.add_scalar("validation/avg_accuracy", avg_accuracy, engine.state.epoch)

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
    writer.close()