def evaluate(self, policy: TorchPolicy) -> None: """Evaluate given policy (results are stored in stat logs) and dump the model if the reward improved. :param policy: Policy to evaluate. """ policy.eval() with torch.no_grad(): total_loss = [] for iteration, data in enumerate(self.data_loader, 0): observations, actions, actor_ids = data[0], data[1], data[-1] actor_ids = debatch_actor_ids(actor_ids) # Convert only actions to torch, since observations are converted in # policy.compute_substep_policy_output method convert_to_torch(actions, device=policy.device, cast=None, in_place=True) total_loss.append( self.loss.calculate_loss(policy=policy, observations=observations, actions=actions, events=self.eval_events, actor_ids=actor_ids).item()) if self.model_selection: self.model_selection.update(-np.mean(total_loss).item())
def evaluate(self, policy: TorchPolicy) -> None: """Evaluate given policy (results are stored in stat logs) and dump the model if the reward improved. :param policy: Policy to evaluate """ policy.eval() n_done_episodes = 0 observations = self.eval_env.reset() # Clear epoch stats in case there were some unfinished episodes from the previous evaluation round self.eval_env.clear_epoch_stats() while n_done_episodes < self.n_episodes: # Sample action and take the step sampled_action = policy.compute_action( observations, actor_id=self.eval_env.actor_id(), maze_state=None, deterministic=self.deterministic) observations, rewards, dones, infos = self.eval_env.step( sampled_action) # Count done episodes n_done_episodes += np.count_nonzero(dones) # Enforce the epoch stats calculation (without calling increment_log_step() -- this is up to the trainer) self.eval_env.write_epoch_stats() # Notify the model selection if available if self.model_selection: reward = self.eval_env.get_stats_value(BaseEnvEvents.reward, LogStatsLevel.EPOCH, name="mean") self.model_selection.update(reward)
def policy(self) -> Optional[TorchPolicy]:
    """Implementation of the BaseModelComposer interface, returns the policy networks."""
    if self._policy_type is None:
        return None

    elif issubclass(self._policy_type, ProbabilisticPolicyComposer):
        networks = dict()
        for sub_step_key in self.action_spaces_dict.keys():
            networks[sub_step_key], self._shared_embedding_nets[sub_step_key] = self.template_policy_net(
                observation_space=self.observation_spaces_dict[sub_step_key],
                action_space=self.action_spaces_dict[sub_step_key],
                shared_embedding_keys=self.model_builder.shared_embedding_keys[sub_step_key])

        return TorchPolicy(networks=networks, distribution_mapper=self.distribution_mapper, device="cpu")

    else:
        raise ValueError(f"Policy type {self._policy_type} not supported by the template model composer!")

def _policy(env: GymMazeEnv):
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    policies = {0: FlattenConcatPolicyNet({'observation': (4,)}, {'action': (2,)}, hidden_units=[16],
                                          non_lin=nn.Tanh)}
    critics = {0: FlattenConcatStateValueNet({'observation': (4,)}, hidden_units=[16], non_lin=nn.Tanh)}

    policy = TorchPolicy(networks=policies, distribution_mapper=distribution_mapper, device="cpu")
    critic = TorchSharedStateCritic(networks=critics, obs_spaces_dict=env.observation_spaces_dict, device="cpu",
                                    stack_observations=False)

    return TorchActorCritic(policy=policy, critic=critic, device="cpu")

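# Hypothetical usage sketch for the helper above. The names `_policy`, `GymMazeEnv` and the CartPole
# shapes are taken from the snippet; the single-env rollout step below is an assumption (it presumes
# `env.actor_id()` is available on the flat env and that `compute_action` accepts the arguments shown
# in the evaluator snippets), not part of the original code.
def _example_rollout_step():
    env = GymMazeEnv(env="CartPole-v0")
    actor_critic = _policy(env)

    # sample a single deterministic action for the current observation and apply it
    observation = env.reset()
    action = actor_critic.policy.compute_action(observation,
                                                actor_id=env.actor_id(),
                                                maze_state=None,
                                                deterministic=True)
    observation, reward, done, info = env.step(action)
    return reward
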
def policy(self) -> TorchPolicy:
    """implementation of :class:`~maze.perception.models.policies.base_policy_composer.BasePolicyComposer`
    """
    return TorchPolicy(networks=self._policies,
                       distribution_mapper=self._distribution_mapper,
                       device='cpu',
                       substeps_with_separate_agent_nets=self._substeps_with_separate_agent_nets)

def setup(self, cfg: DictConfig) -> None: """ Setup the training master node. """ super().setup(cfg) # --- init the shared noise table --- print("********** Init Shared Noise Table **********") self.shared_noise = SharedNoiseTable( count=self.shared_noise_table_size) # --- initialize policies --- torch_policy = TorchPolicy( networks=self._model_composer.policy.networks, distribution_mapper=self._model_composer.distribution_mapper, device="cpu") torch_policy.seed(self.maze_seeding.agent_global_seed) # support policy wrapping if self._cfg.algorithm.policy_wrapper: policy = Factory(Policy).instantiate( self._cfg.algorithm.policy_wrapper, torch_policy=torch_policy) assert isinstance(policy, Policy) and isinstance( policy, TorchModel) torch_policy = policy print("********** Trainer Setup **********") self._trainer = ESTrainer( algorithm_config=cfg.algorithm, torch_policy=torch_policy, shared_noise=self.shared_noise, normalization_stats=self._normalization_statistics) # initialize model from input_dir self._init_trainer_from_input_dir( trainer=self._trainer, state_dict_dump_file=self.state_dict_dump_file, input_dir=cfg.input_dir) self._model_selection = BestModelSelection( dump_file=self.state_dict_dump_file, model=torch_policy, dump_interval=self.dump_interval)
def main(n_epochs) -> None: """Trains the cart pole environment with the ES implementation. """ env = GymMazeEnv(env="CartPole-v0") distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={}) obs_shapes = observation_spaces_to_in_shapes(env.observation_spaces_dict) action_shapes = { step_key: { action_head: distribution_mapper.required_logits_shape(action_head) for action_head in env.action_spaces_dict[step_key].spaces.keys() } for step_key in env.action_spaces_dict.keys() } # initialize policies policies = [ PolicyNet(obs_shapes=obs_shapes[0], action_logits_shapes=action_shapes[0], non_lin=nn.SELU) ] # initialize optimizer policy = TorchPolicy(networks=list_to_dict(policies), distribution_mapper=distribution_mapper, device="cpu") shared_noise = SharedNoiseTable(count=1_000_000) algorithm_config = ESAlgorithmConfig(n_rollouts_per_update=100, n_timesteps_per_update=0, max_steps=0, optimizer=Adam(step_size=0.01), l2_penalty=0.005, noise_stddev=0.02, n_epochs=n_epochs, policy_wrapper=None) trainer = ESTrainer(algorithm_config=algorithm_config, torch_policy=policy, shared_noise=shared_noise, normalization_stats=None) setup_logging(job_config=None) maze_rng = np.random.RandomState(None) # run with pseudo-distribution, without worker processes trainer.train(ESDummyDistributedRollouts( env=env, n_eval_rollouts=10, shared_noise=shared_noise, agent_instance_seed=MazeSeeding.generate_seed_from_random_state( maze_rng)), model_selection=None)
def train_function(n_epochs: int, distributed_env_cls) -> A2C:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """

    # initialize distributed env
    envs = distributed_env_cls([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(2)])

    # initialize the eval env and enable statistics collection
    eval_env = distributed_env_cls([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(2)], logging_prefix='eval')

    # init distribution mapper
    env = GymMazeEnv(env="CartPole-v0")
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # initialize policies
    policies = {0: FlattenConcatPolicyNet({'observation': (4,)}, {'action': (2,)}, hidden_units=[16],
                                          non_lin=nn.Tanh)}

    # initialize critic
    critics = {0: FlattenConcatStateValueNet({'observation': (4,)}, hidden_units=[16], non_lin=nn.Tanh)}

    # algorithm configuration
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=2,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env, n_episodes=1, model_selection=None, deterministic=True)
    )

    # initialize actor critic model
    model = TorchActorCritic(
        policy=TorchPolicy(networks=policies, distribution_mapper=distribution_mapper,
                           device=algorithm_config.device),
        critic=TorchSharedStateCritic(networks=critics, obs_spaces_dict=env.observation_spaces_dict,
                                      device=algorithm_config.device, stack_observations=False),
        device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              algorithm_config=algorithm_config,
              evaluator=algorithm_config.rollout_evaluator,
              model=model,
              model_selection=None)

    # train agent
    a2c.train()

    return a2c

def train_setup(n_epochs: int, policy_wrapper=None) -> Tuple[TorchPolicy, StructuredEnv, ESTrainer]:
    """Sets up training of the cart pole environment with the ES implementation.
    """

    # initialize the env
    env = GymMazeEnv(env="CartPole-v0")

    # initialize distribution mapper
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # initialize policies
    policies = {0: FlattenConcatPolicyNet({'observation': (4,)}, {'action': (2,)}, hidden_units=[16],
                                          non_lin=nn.Tanh)}

    # initialize the torch policy
    policy = TorchPolicy(networks=policies, distribution_mapper=distribution_mapper, device="cpu")

    # reduce the noise table size to speed up testing
    shared_noise = SharedNoiseTable(count=1_000_000)

    algorithm_config = ESAlgorithmConfig(
        n_rollouts_per_update=100,
        n_timesteps_per_update=0,
        max_steps=0,
        optimizer=Adam(step_size=0.01),
        l2_penalty=0.005,
        noise_stddev=0.02,
        n_epochs=n_epochs,
        policy_wrapper=policy_wrapper)

    # initialize trainer
    trainer = ESTrainer(algorithm_config=algorithm_config,
                        shared_noise=shared_noise,
                        torch_policy=policy,
                        normalization_stats=None)

    return policy, env, trainer

def calculate_loss(self,
                   policy: TorchPolicy,
                   observations: List[ObservationType],
                   actions: List[TorchActionType],
                   actor_ids: List[ActorID],
                   events: ImitationEvents) -> torch.Tensor:
    """Calculate and return the training loss for one step (= multiple sub-steps in structured scenarios).

    :param policy: Structured policy to evaluate.
    :param observations: List with observations w.r.t. actor_ids.
    :param actions: List with actions w.r.t. actor_ids.
    :param actor_ids: List of actor ids.
    :param events: Events of current episode.
    :return: Total loss.
    """
    losses = []

    # Iterate over all sub-steps
    assert len(actor_ids) == len(actions)
    assert len(actor_ids) == len(observations)
    for actor_id, observation, target_action in zip(actor_ids, observations, actions):
        policy_output = policy.compute_substep_policy_output(observation, actor_id=actor_id)
        substep_losses = self._get_substep_loss(actor_id, policy_output.action_logits, target_action,
                                                self.action_spaces_dict[actor_id.step_key], events=events)
        losses.append(substep_losses)

        # Compute and report policy entropy
        entropy = policy_output.entropy.mean()
        events.policy_entropy(step_id=actor_id.step_key, agent_id=actor_id.agent_id, value=entropy.item())
        if self.entropy_coef > 0:
            losses.append(-self.entropy_coef * entropy)

    return sum(losses)

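# Simplified illustration of the per-substep term aggregated above. This is NOT the actual
# `_get_substep_loss` implementation; it is a hedged sketch of a behavioral-cloning loss for
# purely discrete action heads: one cross-entropy term (negative log-likelihood of the
# demonstrated action under the policy logits) per action head, summed up.
from typing import Dict

import torch


def _example_discrete_substep_loss(action_logits: Dict[str, torch.Tensor],
                                   target_actions: Dict[str, torch.Tensor]) -> torch.Tensor:
    losses = []
    for action_head, logits in action_logits.items():
        # logits: (batch, num_classes); target: (batch,) of class indices
        losses.append(torch.nn.functional.cross_entropy(logits, target_actions[action_head].long()))
    return sum(losses)
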
def setup(self, cfg: DictConfig) -> None: """ See :py:meth:`~maze.train.trainers.common.training_runner.TrainingRunner.setup`. """ super().setup(cfg) env = self.env_factory() with SwitchWorkingDirectoryToInput(cfg.input_dir): dataset = Factory(base_type=Dataset).instantiate( self.dataset, conversion_env_factory=self.env_factory) assert len(dataset) > 0, f"Expected to find trajectory data, but did not find any. Please check that " \ f"the path you supplied is correct." size_in_byte, size_in_gbyte = getsize(dataset) BColors.print_colored( f'Size of loaded dataset: {size_in_byte} -> {size_in_gbyte} GB', BColors.OKBLUE) validation, train = self._split_dataset( dataset, cfg.algorithm.validation_percentage, self.maze_seeding.generate_env_instance_seed()) # Create data loaders torch_generator = torch.Generator().manual_seed( self.maze_seeding.generate_env_instance_seed()) train_data_loader = DataLoader(train, shuffle=True, batch_size=cfg.algorithm.batch_size, generator=torch_generator, num_workers=self.dataset.n_workers) policy = TorchPolicy( networks=self._model_composer.policy.networks, distribution_mapper=self._model_composer.distribution_mapper, device=cfg.algorithm.device, substeps_with_separate_agent_nets=self._model_composer.policy. substeps_with_separate_agent_nets) policy.seed(self.maze_seeding.agent_global_seed) self._model_selection = BestModelSelection( self.state_dict_dump_file, policy, dump_interval=self.dump_interval) optimizer = Factory(Optimizer).instantiate(cfg.algorithm.optimizer, params=policy.parameters()) loss = BCLoss(action_spaces_dict=env.action_spaces_dict, entropy_coef=cfg.algorithm.entropy_coef) self._trainer = BCTrainer(algorithm_config=self._cfg.algorithm, data_loader=train_data_loader, policy=policy, optimizer=optimizer, loss=loss) # initialize model from input_dir self._init_trainer_from_input_dir( trainer=self._trainer, state_dict_dump_file=self.state_dict_dump_file, input_dir=cfg.input_dir) # evaluate using the validation set self.evaluators = [] if len(validation) > 0: validation_data_loader = DataLoader( validation, shuffle=True, batch_size=cfg.algorithm.batch_size, generator=torch_generator, num_workers=self.dataset.n_workers) self.evaluators += [ BCValidationEvaluator( data_loader=validation_data_loader, loss=loss, logging_prefix="eval-validation", model_selection=self. _model_selection # use the validation set evaluation to select the best model ) ] # if evaluation episodes are set, perform additional evaluation by policy rollout if cfg.algorithm.n_eval_episodes > 0: eval_env = self.create_distributed_eval_env( self.env_factory, self.eval_concurrency, logging_prefix="eval-rollout") eval_env_instance_seeds = [ self.maze_seeding.generate_env_instance_seed() for _ in range(self.eval_concurrency) ] eval_env.seed(eval_env_instance_seeds) self.evaluators += [ RolloutEvaluator(eval_env, n_episodes=cfg.algorithm.n_eval_episodes, model_selection=None) ]
def _get_cartpole_setup_components() -> Tuple[CustomModelComposer, ProbabilisticPolicyComposer,
                                              SharedStateCriticComposer, TorchPolicy, TorchActorCritic]:
    """
    Returns various instantiated components for environment CartPole-v0.
    :return: Various components for the CartPole setting.
    """

    env = GymMazeEnv(env=gym.make("CartPole-v0"))
    observation_space = env.observation_space
    action_space = env.action_space

    policy_net = FlattenConcatPolicyNet({'observation': (4,)}, {'action': (2,)}, hidden_units=[16], non_lin=nn.Tanh)
    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation', out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2], out_num_dims=2,
        net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    distribution_mapper = DistributionMapper(action_space=action_space, distribution_mapper_config={})

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks, distribution_mapper=distribution_mapper, device='cpu')

    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=distribution_mapper,
        networks=[{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1})

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = FlattenConcatStateValueNet({'observation': (4,)}, hidden_units=[16], non_lin=nn.Tanh)
    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation', out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2], out_num_dims=2,
        net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(networks=value_networks, obs_spaces_dict=env.observation_spaces_dict,
                                          device='cpu', stack_observations=True)

    # Critic composer.
    critic_composer = SharedStateCriticComposer(
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict={0: 1},
        networks=value_networks,
        stack_observations=True)

    # Initializing the ActorCritic Model.
    # -----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy, critic=torch_critic, device='cpu')

    model_composer = CustomModelComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper_config={},
        policy=policy_composer,
        critic=None,
        agent_counts_dict={0: 1})

    return model_composer, policy_composer, critic_composer, torch_policy, actor_critic_model

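# Usage sketch for the helper above (the unpacking order simply mirrors its return statement).
model_composer, policy_composer, critic_composer, torch_policy, actor_critic_model = _get_cartpole_setup_components()
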
def train_function(n_epochs: int, epoch_length: int, deterministic_eval: bool, eval_repeats: int,
                   distributed_env_cls, split_rollouts_into_transitions: bool) -> SAC:
    """Trains the LunarLanderContinuous environment with the SAC trainer and performs tests on it.
    """

    # initialize the env factory
    env_factory = lambda: GymMazeEnv(env="LunarLanderContinuous-v2")

    # initialize the eval env and enable statistics collection
    eval_env = distributed_env_cls([env_factory for _ in range(2)], logging_prefix='eval')

    env = env_factory()

    # init distribution mapper
    distribution_mapper = DistributionMapper(
        action_space=env.action_space,
        distribution_mapper_config=[{
            'action_space': 'gym.spaces.Box',
            'distribution': 'maze.distributions.squashed_gaussian.SquashedGaussianProbabilityDistribution'}])

    action_shapes = {step_key: {action_head: tuple(distribution_mapper.required_logits_shape(action_head))
                                for action_head in env.action_spaces_dict[step_key].spaces.keys()}
                     for step_key in env.action_spaces_dict.keys()}
    obs_shapes = observation_spaces_to_in_shapes(env.observation_spaces_dict)

    # initialize policies
    policies = {ii: PolicyNet(obs_shapes=obs_shapes[ii], action_logits_shapes=action_shapes[ii], non_lin=nn.Tanh)
                for ii in obs_shapes.keys()}

    # add the action shapes to the observation shapes for the Q-critic inputs
    for key, value in env.action_spaces_dict.items():
        for act_key, act_space in value.spaces.items():
            obs_shapes[key][act_key] = act_space.sample().shape

    # initialize critic
    critics = {ii: QCriticNetContinuous(obs_shapes[ii], non_lin=nn.Tanh, action_spaces_dict=env.action_spaces_dict)
               for ii in obs_shapes.keys()}

    # algorithm configuration
    algorithm_config = SACAlgorithmConfig(
        n_rollout_steps=5,
        lr=0.001,
        entropy_coef=0.2,
        gamma=0.99,
        max_grad_norm=0.5,
        batch_size=100,
        num_actors=2,
        tau=0.005,
        target_update_interval=1,
        entropy_tuning=False,
        device='cpu',
        replay_buffer_size=10000,
        initial_buffer_size=100,
        initial_sampling_policy={'_target_': 'maze.core.agent.random_policy.RandomPolicy'},
        rollouts_per_iteration=1,
        split_rollouts_into_transitions=split_rollouts_into_transitions,
        entropy_coef_lr=0.0007,
        num_batches_per_iter=1,
        n_epochs=n_epochs,
        epoch_length=epoch_length,
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env, n_episodes=eval_repeats, model_selection=None,
                                           deterministic=deterministic_eval),
        patience=50,
        target_entropy_multiplier=1.0)

    actor_policy = TorchPolicy(networks=policies, distribution_mapper=distribution_mapper, device='cpu')

    replay_buffer = UniformReplayBuffer(buffer_size=algorithm_config.replay_buffer_size, seed=1234)
    SACRunner.init_replay_buffer(replay_buffer=replay_buffer,
                                 initial_sampling_policy=algorithm_config.initial_sampling_policy,
                                 initial_buffer_size=algorithm_config.initial_buffer_size,
                                 replay_buffer_seed=1234,
                                 split_rollouts_into_transitions=split_rollouts_into_transitions,
                                 n_rollout_steps=algorithm_config.n_rollout_steps,
                                 env_factory=env_factory)

    distributed_actors = DummyDistributedWorkersWithBuffer(
        env_factory=env_factory,
        worker_policy=actor_policy,
        n_rollout_steps=algorithm_config.n_rollout_steps,
        n_workers=algorithm_config.num_actors,
        batch_size=algorithm_config.batch_size,
        rollouts_per_iteration=algorithm_config.rollouts_per_iteration,
        split_rollouts_into_transitions=split_rollouts_into_transitions,
        env_instance_seeds=list(range(algorithm_config.num_actors)),
        replay_buffer=replay_buffer)

    critics_policy = TorchStepStateActionCritic(networks=critics,
                                                num_policies=1,
                                                device='cpu',
                                                only_discrete_spaces={0: False},
                                                action_spaces_dict=env.action_spaces_dict)

    learner_model = TorchActorCritic(policy=actor_policy, critic=critics_policy, device='cpu')

    # initialize trainer
    sac = SAC(learner_model=learner_model,
              distributed_actors=distributed_actors,
              algorithm_config=algorithm_config,
              evaluator=algorithm_config.rollout_evaluator,
              model_selection=None)

    # train agent
    sac.train(n_epochs=algorithm_config.n_epochs)

    return sac

def main(n_epochs: int, rnn_steps: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    env_name = "CartPole-v0"

    # initialize distributed env
    envs = SequentialVectorEnv(
        [lambda: to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps) for _ in range(4)],
        logging_prefix="train")

    # initialize the eval env and enable statistics collection
    eval_env = SequentialVectorEnv(
        [lambda: to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps) for _ in range(4)],
        logging_prefix="eval")

    # map observations to a modality
    obs_modalities_mappings = {"observation": "feature"}

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {"block_type": "maze.perception.blocks.DenseBlock",
                                  "block_params": {"hidden_units": [32, 32],
                                                   "non_lin": "torch.nn.Tanh"}}
    modality_config["hidden"] = {"block_type": "maze.perception.blocks.DenseBlock",
                                 "block_params": {"hidden_units": [64],
                                                  "non_lin": "torch.nn.Tanh"}}
    modality_config["recurrence"] = {}
    if rnn_steps > 0:
        modality_config["recurrence"] = {"block_type": "maze.perception.blocks.LSTMLastStepBlock",
                                         "block_params": {"hidden_size": 8,
                                                          "num_layers": 1,
                                                          "bidirectional": False,
                                                          "non_lin": "torch.nn.Tanh"}}

    template_builder = TemplateModelComposer(
        action_spaces_dict=envs.action_spaces_dict,
        observation_spaces_dict=envs.observation_spaces_dict,
        agent_counts_dict=envs.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=ConcatModelBuilder(modality_config, obs_modalities_mappings, None),
        policy={'_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer'},
        critic={'_target_': 'maze.perception.models.critics.StateCriticComposer'})

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env, n_episodes=1, model_selection=None, deterministic=True))

    model = TorchActorCritic(
        policy=TorchPolicy(networks=template_builder.policy.networks,
                           distribution_mapper=template_builder.distribution_mapper,
                           device=algorithm_config.device),
        critic=template_builder.critic,
        device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()

def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """

    # initialize distributed env
    envs = SequentialVectorEnv([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)], logging_prefix="train")

    # initialize the eval env and enable statistics collection
    eval_env = SequentialVectorEnv([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)], logging_prefix="eval")

    # init a single env for space access
    env = GymMazeEnv(env="CartPole-v0")

    # init default distribution mapper
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # initialize policies
    policies = {0: PolicyNet({'observation': (4,)}, {'action': (2,)}, non_lin=nn.Tanh)}

    # initialize critic
    critics = {0: ValueNet({'observation': (4,)})}

    # algorithm configuration
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(eval_env=eval_env, n_episodes=1, model_selection=None, deterministic=True))

    # initialize actor critic model
    model = TorchActorCritic(
        policy=TorchPolicy(networks=policies, distribution_mapper=distribution_mapper,
                           device=algorithm_config.device),
        critic=TorchSharedStateCritic(networks=critics, obs_spaces_dict=env.observation_spaces_dict,
                                      device=algorithm_config.device, stack_observations=False),
        device=algorithm_config.device)

    a2c = A2C(rollout_generator=RolloutGenerator(envs),
              evaluator=algorithm_config.rollout_evaluator,
              algorithm_config=algorithm_config,
              model=model,
              model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()

def train(n_epochs):
    # Instantiate one environment. This will be used for convenient access to observation
    # and action spaces.
    env = cartpole_env_factory()
    observation_space = env.observation_space
    action_space = env.action_space

    # Policy Setup
    # ------------

    # Policy Network
    # ^^^^^^^^^^^^^^
    # Instantiate policy with the correct shapes of observation and action spaces.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (action_space.spaces['action'].n,)})

    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation', out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2], out_num_dims=2,
        net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config={})

    # Optionally, you can specify a different distribution with the distribution_mapper_config argument. Using a
    # Categorical distribution for a discrete action space would be done via
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config=[{
            "action_space": gym.spaces.Discrete,
            "distribution": "maze.distributions.categorical.CategoricalProbabilityDistribution"}])

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks, distribution_mapper=distribution_mapper, device='cpu')

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = CartpoleValueNet(obs_shapes={'observation': observation_space.spaces['observation'].shape})

    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation', out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2], out_num_dims=2,
        net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(networks=value_networks, obs_spaces_dict=env.observation_spaces_dict,
                                          device='cpu', stack_observations=False)

    # Initializing the ActorCritic Model.
    # -----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy, critic=torch_critic, device='cpu')

    # Instantiating the Trainer
    # =========================
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Distributed Environments
    # ------------------------
    # In order to use the distributed trainers, the previously created env factory is supplied to one of Maze's
    # distribution classes:
    train_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="train")
    eval_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="eval")

    # Initialize best model selection.
    model_selection = BestModelSelection(dump_file="params.pt", model=actor_critic_model)

    a2c_trainer = A2C(rollout_generator=RolloutGenerator(train_envs),
                      evaluator=algorithm_config.rollout_evaluator,
                      algorithm_config=algorithm_config,
                      model=actor_critic_model,
                      model_selection=model_selection)

    # Train the Agent
    # ===============
    # Before starting the training, we will enable logging by calling
    log_dir = '.'
    setup_logging(job_config=None, log_dir=log_dir)

    # Now, we can train the agent.
    a2c_trainer.train()

    return 0
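
# Hedged usage sketch for the tutorial training function above (the epoch count is illustrative).
# Note that logging output and the best-model dump ("params.pt", via BestModelSelection) are written
# into the current working directory, since log_dir='.' and a relative dump_file are used.
if __name__ == '__main__':
    train(n_epochs=15)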