    gamma=config['cyclic_lr_gamma'])


# Update stats function to include the current learning rate
def stats(policy, train_batch):
    return {
        'policy_entropy': policy.entropy.item(),
        'policy_loss': policy.pi_err.item(),
        'vf_loss': policy.value_err.item(),
        'cur_lr': policy._optimizers[0].param_groups[0]['lr'],
    }


def get_policy_class(config):
    return TunedA2CPolicy


TunedA2CPolicy = A3CTorchPolicy.with_updates(
    name='TunedA2CPolicy',
    get_default_config=lambda: TUNED_A2C_CONFIG,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    postprocess_fn=add_advantages,
    mixins=[ValueNetworkMixin],
    optimizer_fn=torch_optimizer)

TunedA2CTrainer = A2CTrainer.with_updates(
    name='TunedA2C',
    default_config=TUNED_A2C_CONFIG,
    default_policy=TunedA2CPolicy,
    get_policy_class=get_policy_class)
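# Usage (a minimal sketch, not part of the original listing): trainers built
# with `with_updates` expose the standard Ray 1.x Trainer API, so a
# hypothetical driver loop might look like the following. The env name,
# iteration count, and config values are placeholders.
import ray

ray.init()
trainer = TunedA2CTrainer(config={'env': 'CartPole-v1'})
for _ in range(10):
    result = trainer.train()  # runs one training iteration
    print(result['episode_reward_mean'])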
# Stats function for FuN, adding the manager/worker losses and the
# intrinsic reward to the reported metrics
def stats(policy, train_batch):
    return {
        'policy_entropy': policy.entropy.item(),
        'policy_loss': policy.pi_err.item(),
        'manager_loss': policy.manager_loss.item(),
        'manager_vf_loss': policy.manager_value_err.item(),
        'worker_vf_loss': policy.worker_value_err.item(),
        'cur_lr': policy._optimizers[0].param_groups[0]['lr'],
        'fun_intrinsic_reward': train_batch['fun_intrinsic_reward'].mean().item(),
    }


def get_policy_class(config):
    return FuNPolicy


FuNPolicy = A3CTorchPolicy.with_updates(
    name='FuNPolicy',
    get_default_config=lambda: FUN_CONFIG,
    extra_action_out_fn=model_extra_out,
    postprocess_fn=postprocesses_trajectories,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    mixins=[ValueNetworkMixin],
    optimizer_fn=torch_optimizer)

FuNTrainer = A2CTrainer.with_updates(
    name='FuN',
    default_config=FUN_CONFIG,
    default_policy=FuNPolicy,
    get_policy_class=get_policy_class)
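# The dictionary returned by `stats` is reported in each training result
# under result['info']['learner'] (a sketch; the exact nesting varies by Ray
# version, and some releases tuck per-policy stats under a 'learner_stats'
# sub-key). The env name is a placeholder.
fun_trainer = FuNTrainer(config={'env': 'CartPole-v1'})
result = fun_trainer.train()
policy_info = result['info']['learner']['default_policy']
stats_dict = policy_info.get('learner_stats', policy_info)
print(stats_dict['fun_intrinsic_reward'], stats_dict['cur_lr'])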
# Stats function for Wher, additionally tracking the ICM loss and the
# exploration rewards
def stats(policy, train_batch):
    return {
        'policy_entropy': policy.entropy.item(),
        'policy_loss': policy.pi_err.item(),
        'manager_loss': policy.manager_loss.item(),
        'manager_vf_loss': policy.manager_value_err.item(),
        'worker_vf_loss': policy.worker_value_err.item(),
        'cur_lr': policy._optimizers[0].param_groups[0]['lr'],
        'fun_intrinsic_reward': train_batch['fun_intrinsic_reward'].mean().item(),
        'icm_loss': policy.icm_loss.item(),
        'exploration_rewards': train_batch['exploration_rewards'].mean().item(),
    }


def get_policy_class(config):
    return WherPolicy


WherPolicy = A3CTorchPolicy.with_updates(
    name='WherPolicy',
    get_default_config=lambda: WHER_CONFIG,
    extra_action_out_fn=model_extra_out,
    postprocess_fn=postprocesses_trajectories,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    mixins=[ValueNetworkMixin],
    optimizer_fn=torch_optimizer)

WherTrainer = A2CTrainer.with_updates(
    name='Wher',
    default_config=WHER_CONFIG,
    default_policy=WherPolicy,
    get_policy_class=get_policy_class)
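# All three trainers implement Ray's Trainable interface, so they can also be
# launched through Ray Tune (a minimal sketch; the env and stopping criterion
# are placeholders):
from ray import tune

tune.run(WherTrainer,
         config={'env': 'CartPole-v1'},
         stop={'timesteps_total': 100000})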