def sac_entropy_adjustment_train(env: gym.Env, run_params: RunParams, sac_params: SacEntropyAdjustmentParams):
    """
    Trains the soft actor-critic (SAC) agent on the given environment. Training is done at the end of each episode.
    Only continuous action spaces are supported.

    Several features can be optionally enabled:
    1) Scaling / normalizing the states / observations
    2) Logging training statistics on Tensorboard
    3) Rendering the environment periodically (pick render_frequency in the RunParams)
    4) Testing the agent's performance periodically
    5) Saving the policy and value estimators to disk periodically
    6) Taking random actions during the first X steps (see SacEntropyAdjustmentParams.num_random_action_steps)
    """
    assert run_params.continuous_actions, "SAC implementation only implemented for continuous action spaces"

    print(f"The goal is a running reward of at least {env.spec.reward_threshold}.")

    # Optimization for speed: don't compute gradients for the target networks, since we will never use them
    for network in [sac_params.policy_target, sac_params.value_estimator1_target, sac_params.value_estimator2_target]:
        for parameter in network.parameters():
            parameter.requires_grad = False

    # Setup tensorboard
    writer = run_params.get_tensorboard_writer(env) if run_params.use_tensorboard else None

    # Setup scaler, training info and replay buffer
    scaler = setup_observation_scaler(env) if run_params.should_scale_states else None
    training_info = TrainingInfo(GAMMA=run_params.gamma)
    replay_buffer = ReplayBuffer(sac_params.replay_buffer_size)

    training_step_number, step_number, test_episode_num = 0, 0, 0
    max_episode_steps = env.spec.max_episode_steps

    for episode_number in range(run_params.maximum_episodes):
        state = env.reset()
        episode_length = 0

        # Update policy bounds
        # sac_params.policy.action_high = sac_params.policy_target.action_high = torch.tensor(env.action_space.high)
        # sac_params.policy.action_low = sac_params.policy_target.action_low = torch.tensor(env.action_space.low)

        # Do a whole episode
        for t in range(max_episode_steps):
            if run_params.should_scale_states:
                state = scale_state(scaler, state)

            # Pick an action, execute it and observe the results.
            # Note: in the first num_random_action_steps steps, we randomly pick actions from
            # the action space (uniformly) to have better exploration.
            if step_number >= sac_params.num_random_action_steps:
                action, log_prob = select_action_sac(state, sac_params, compute_log_prob=True)
            else:
                action = env.action_space.sample()
                log_prob = -1

            # To be sure that actions are in the action space (see watershed.py)
            action = np.clip(action, env.action_space.low, env.action_space.high)

            # For debugging, log the Q-values
            if run_params.use_tensorboard:
                if random.random() < 0.02:  # Don't log too often to avoid slowing things down
                    s, a = torch.tensor(state).float(), torch.tensor(action).float()
                    value1 = sac_params.value_estimator1.forward(s, a)
                    value2 = sac_params.value_estimator2.forward(s, a)
                    value1_target = sac_params.value_estimator1_target.forward(s, a)
                    value2_target = sac_params.value_estimator2_target.forward(s, a)

                    for action_index in range(a.shape[0]):
                        writer.add_scalar(f"Action/{action_index}", a[action_index], step_number)
                    writer.add_scalar("Q-values/Normal Network 1", value1, step_number)
                    writer.add_scalar("Q-values/Normal Network 2", value2, step_number)
                    writer.add_scalar("Q-values/Target Network 1", value1_target, step_number)
                    writer.add_scalar("Q-values/Target Network 2", value2_target, step_number)
                    writer.add_scalar("Action/Log prob action", log_prob, step_number)

            new_state, reward, done, _ = env.step(action)

            # Render the environment if wanted
            if run_params.should_render(episode_number):
                env.render()

            # Store reward and update the running reward
            training_info.record_step(state, action, reward)

            # Add the transition to the replay buffer
            new_state_scaled = scale_state(scaler, new_state) if run_params.should_scale_states else new_state
            replay_buffer.store(state, action, reward, new_state_scaled, done and t < max_episode_steps - 1)

            state = new_state

            if done:
                break

            step_number += 1
            episode_length += 1

        # Training at the end of the episode; approach taken from
        # https://github.com/createamind/DRL/blob/master/spinup/algos/sac1/sac1_BipedalWalker-v2_200ep.py
        for update_step in range(int(episode_length * 1.5)):
            batch_transitions = replay_buffer.sample_batch(sac_params.batch_size)
            update_models(batch_transitions, sac_params, run_params, writer, training_step_number)
            training_step_number += 1

        if episode_number % sac_params.test_frequency == 0:
            test_agent_performance(env, sac_params, run_params, writer, test_episode_num, scaler)
            test_episode_num += 1

        if run_params.should_save_model(episode_number):
            save_model_sac(env, sac_params, scaler)

        training_info.update_running_reward(rate=0.01)

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t + 1, training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t + 1, training_info, writer)

        # Check if we have solved the environment reliably
        if run_params.stop_at_threshold and env.spec.reward_threshold is not None \
                and training_info.running_reward > env.spec.reward_threshold:
            print(f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                  f"{env.spec.reward_threshold}. The last episode ran for {t} steps.")
            save_model_sac(env, sac_params, scaler)
            break

        training_info.reset()

    close_tensorboard(run_params, writer)
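

# Both sac_entropy_adjustment_train (above) and ddpg_train (below) rely on a ReplayBuffer that
# offers store() and sample_batch(). The following is a minimal sketch of such a buffer,
# assuming uniform random sampling and a dictionary-of-arrays batch format; the actual
# ReplayBuffer consumed by update_models in this repository may store and return transitions
# differently.
from collections import deque  # import added for this sketch


class ReplayBufferSketch:
    """Fixed-size FIFO transition buffer with uniform random mini-batch sampling."""

    def __init__(self, max_size: int):
        self.buffer = deque(maxlen=max_size)  # Oldest transitions are dropped automatically when full

    def store(self, state, action, reward, new_state, done):
        """Appends one transition to the buffer."""
        self.buffer.append((state, action, reward, new_state, done))

    def sample_batch(self, batch_size: int) -> dict:
        """Samples a mini-batch of transitions uniformly at random (without replacement)."""
        transitions = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, new_states, dones = zip(*transitions)
        return {
            "states": np.array(states),
            "actions": np.array(actions),
            "rewards": np.array(rewards),
            "new_states": np.array(new_states),
            "dones": np.array(dones),
        }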


def actor_critic_train_per_episode(policy: SimplePolicyContinuous,
                                   critic: SimpleCritic,
                                   env: gym.Env,
                                   optimizer: Optimizer,
                                   run_params: RunParams,
                                   lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None):
    """
    Trains the actor-critic agent on the given environment. Training is done at the end of each episode, instead of
    at the end of each step of an episode. This means the agent trains much less frequently.
    Both discrete and continuous action spaces are supported.

    Several features can be optionally enabled:
    1) Scaling / normalizing the states / observations
    2) Logging training statistics on Tensorboard
    3) Rendering the environment periodically (pick render_frequency in the RunParams)
    4) Using a learning rate scheduler
    """
    training_info = TrainingInfo(GAMMA=run_params.gamma)

    print(f"The goal is a running reward of at least {env.spec.reward_threshold}.")

    # https://medium.com/@asteinbach/actor-critic-using-deep-rl-continuous-mountain-car-in-tensorflow-4c1fb2110f7c
    # says it's crucial to scale the state
    if run_params.should_scale_states:
        scaler = setup_observation_scaler(env)

    writer = run_params.get_tensorboard_writer(env) if run_params.use_tensorboard else None

    for episode_number in itertools.count():  # itertools.count() is basically range(+infinity)
        state = env.reset()

        # Do a whole episode (up to env.spec.max_episode_steps steps, since we don't want infinite episodes)
        for t in range(env.spec.max_episode_steps):
            if run_params.should_scale_states:
                state = scale_state(scaler, state)

            if run_params.continuous_actions:
                action = select_action_continuous(state, policy, training_info, env)
            else:
                action = select_action_discrete(state, policy, training_info)

            state_value = get_state_value(state, critic)

            new_state, reward, done, _ = env.step(action)

            if run_params.should_render(episode_number):
                env.render()

            # Store reward and update the running reward
            training_info.record_step(state, action, reward, state_value)

            state = new_state

            if done:
                break

        training_info.update_running_reward()

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t, training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t, training_info, writer)

        # Check if we have solved the environment reliably
        if env.spec.reward_threshold is not None and training_info.running_reward > env.spec.reward_threshold:
            print(f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                  f"{env.spec.reward_threshold}. The last episode ran for {t} steps.")
            break

        train_policy_on_episode(optimizer, training_info, episode_number)

        if lr_scheduler:
            lr_scheduler.step(episode_number)

    close_tensorboard(run_params, writer)
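

# The comment in actor_critic_train_per_episode notes that scaling the observations can be crucial.
# Below is a minimal sketch of what setup_observation_scaler / scale_state could look like, assuming
# a sklearn StandardScaler fitted on observations sampled from the observation space; the actual
# helpers used throughout this repository may be implemented differently.
from sklearn.preprocessing import StandardScaler  # import added for this sketch


def setup_observation_scaler_sketch(env: gym.Env, num_samples: int = 10000) -> StandardScaler:
    """Fits a StandardScaler on observations sampled from the environment's observation space."""
    samples = np.array([env.observation_space.sample() for _ in range(num_samples)])
    return StandardScaler().fit(samples)


def scale_state_sketch(scaler: StandardScaler, state: np.ndarray) -> np.ndarray:
    """Normalizes a single observation to approximately zero mean and unit variance."""
    return scaler.transform(state.reshape(1, -1)).flatten()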


def reinforceTraining(policy: SimplePolicyDiscrete, env: gym.Env, optimizer: Optimizer, run_params: RunParams):
    """
    Trains the policy using the REINFORCE algorithm. Training is done at the end of each episode, and can be done
    either using all transitions at once, or over many training iterations on mini-batches of transitions.
    Both discrete and continuous action spaces are supported.

    Several features can be optionally enabled:
    1) Scaling / normalizing the states / observations
    2) Logging training statistics on Tensorboard
    3) Rendering the environment periodically (pick render_frequency in the RunParams)
    4) Saving the policy (and optionally the observation / state scaler) periodically
       (see RunParams.save_model_frequency)
    """
    training_info = TrainingInfo(GAMMA=run_params.gamma)

    print(f"The goal is a running reward of at least {env.spec.reward_threshold}.")

    scaler = setup_observation_scaler(env) if run_params.should_scale_states else None
    writer = run_params.get_tensorboard_writer(env) if run_params.use_tensorboard else None

    for episode_number in itertools.count():  # itertools.count() is basically range(+infinity)
        state = env.reset()

        # Do a whole episode (up to env.spec.max_episode_steps steps, since we don't want infinite episodes)
        for t in range(env.spec.max_episode_steps):
            if run_params.should_scale_states:
                state = scale_state(scaler, state)

            if run_params.continuous_actions:
                action = select_action_continuous(state, policy, training_info, env)
            else:
                action = select_action_discrete(state, policy, training_info)

            new_state, reward, done, _ = env.step(action)

            if run_params.should_render(episode_number):
                env.render()

            # Store reward and update the running reward
            training_info.record_step(state, action, reward)

            state = new_state

            if done:
                break

        training_info.update_running_reward()

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t, training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t, training_info, writer)

        # Check if we have solved the environment reliably
        if env.spec.reward_threshold is not None and training_info.running_reward > env.spec.reward_threshold:
            print(f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                  f"{env.spec.reward_threshold}. The last episode ran for {t} steps.")
            break

        if run_params.train_with_batches:
            train_policy_batches(policy, optimizer, training_info, run_params)
        else:
            train_policy(optimizer, training_info, run_params)

        if run_params.should_save_model(episode_number):
            save_model(policy, env, "policy.data")
            if scaler is not None:
                save_scaler(scaler, env, "scaler.data")

    close_tensorboard(run_params, writer)
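

# reinforceTraining delegates the actual update to train_policy / train_policy_batches. The core
# REINFORCE computation they are expected to perform is sketched below, assuming the discounted
# returns are normalized and that the log-probabilities of the chosen actions were stored during
# the episode; the actual helpers in this repository may differ.
from typing import List  # import added for this sketch


def reinforce_loss_sketch(rewards: List[float], log_probs: List[torch.Tensor], gamma: float) -> torch.Tensor:
    """Returns the REINFORCE loss: minus the log-probabilities weighted by the normalized discounted returns."""
    returns, discounted_sum = [], 0.0
    for reward in reversed(rewards):  # Accumulate the discounted return backwards from the end of the episode
        discounted_sum = reward + gamma * discounted_sum
        returns.insert(0, discounted_sum)
    returns_tensor = torch.tensor(returns)
    returns_tensor = (returns_tensor - returns_tensor.mean()) / (returns_tensor.std() + 1e-8)  # Normalize for stability
    return -(torch.stack(log_probs) * returns_tensor).sum()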


def ddpg_train(env: gym.Env, run_params: RunParams, ddpg_params: DDPGParams):
    """
    Trains a Deep Deterministic Policy Gradient (DDPG) agent on the given environment.

    :param env: the OpenAI gym environment
    :param run_params: the general training parameters shared by all training algorithms
    :param ddpg_params: the DDPG-specific information (networks, optimizers, parameters)
    """
    assert run_params.continuous_actions, "DDPG implementation only implemented for continuous action spaces"

    print(f"The goal is a running reward of at least {env.spec.reward_threshold}.")

    # Optimization for speed: don't compute gradients for the target networks, since we will never use them
    for network in [ddpg_params.policy_target, ddpg_params.value_estimator_target]:
        for parameter in network.parameters():
            parameter.requires_grad = False

    # Setup tensorboard
    writer = run_params.get_tensorboard_writer(env) if run_params.use_tensorboard else None

    # Setup scaler, training info and replay buffer
    scaler = setup_observation_scaler(env) if run_params.should_scale_states else None
    training_info = TrainingInfo(GAMMA=run_params.gamma)
    replay_buffer = ReplayBuffer(ddpg_params.replay_buffer_size)

    step_number, test_episode_num = 0, 0
    max_episode_steps = env.spec.max_episode_steps
    value_time_step = 0

    for episode_number in range(run_params.maximum_episodes):
        state = env.reset()

        # Do a whole episode
        for t in range(max_episode_steps):
            if run_params.should_scale_states:
                state = scale_state(scaler, state)

            # Pick an action, execute it and observe the results.
            # Note: in the first num_random_action_steps steps, we randomly pick actions from
            # the action space (uniformly) to have better exploration.
            if step_number >= ddpg_params.num_random_action_steps:
                action = select_action_ddpg(state, ddpg_params, env, ddpg_params.noise_coeff * 0.995 ** episode_number)
            else:
                action = env.action_space.sample()

            # For debugging, log the Q-values
            if run_params.use_tensorboard:
                s, a = torch.tensor(state).float(), torch.tensor(action).float()
                value = ddpg_params.value_estimator.forward(s, a)
                value_target = ddpg_params.value_estimator_target.forward(s, a)

                for action_index in range(a.shape[0]):
                    writer.add_scalar(f"Action/{action_index}", a[action_index], value_time_step)
                writer.add_scalar("Q-values/Normal Network", value, value_time_step)
                writer.add_scalar("Q-values/Target Network", value_target, value_time_step)
                value_time_step += 1

            new_state, reward, done, _ = env.step(action)

            # Render the environment if wanted
            if run_params.should_render(episode_number):
                env.render()

            # Store reward and update the running reward
            training_info.record_step(state, action, reward)

            # Add the transition to the replay buffer
            new_state_scaled = scale_state(scaler, new_state) if run_params.should_scale_states else new_state
            replay_buffer.store(state, action, reward, new_state_scaled, done and t < max_episode_steps - 1)

            state = new_state

            if done:
                break

            if step_number >= ddpg_params.update_start and step_number % ddpg_params.update_frequency == 0:
                for update_step in range(ddpg_params.update_frequency):
                    batch_transitions = replay_buffer.sample_batch(ddpg_params.batch_size)
                    update_models(batch_transitions, ddpg_params, run_params, writer, step_number)

            step_number += 1

        if episode_number % ddpg_params.test_frequency == 0:
            test_agent_performance(env, ddpg_params, run_params, writer, test_episode_num, scaler)
            test_episode_num += 1

        if run_params.should_save_model(episode_number):
            save_model_ddpg(ddpg_params, env, scaler)

        training_info.update_running_reward()

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t, training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t, training_info, writer)

        # Check if we have solved the environment reliably
        if run_params.stop_at_threshold and env.spec.reward_threshold is not None \
                and training_info.running_reward > env.spec.reward_threshold:
            print(f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                  f"{env.spec.reward_threshold}. The last episode ran for {t} steps.")
            break

        training_info.reset()
        ddpg_params.noise_source.reset()

    close_tensorboard(run_params, writer)
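

# Both DDPG and SAC keep target networks whose parameters have requires_grad disabled above; such
# targets are typically refreshed with a Polyak (soft) update inside update_models. A minimal sketch
# of that update is shown below, assuming a smoothing coefficient tau close to 1; the actual update
# performed by update_models in this repository may differ.
def polyak_update_sketch(network: torch.nn.Module, target_network: torch.nn.Module, tau: float = 0.995):
    """Soft-updates the target network in place: target <- tau * target + (1 - tau) * online."""
    with torch.no_grad():  # The target parameters are never trained directly, so no gradients are needed
        for parameter, target_parameter in zip(network.parameters(), target_network.parameters()):
            target_parameter.data.mul_(tau)
            target_parameter.data.add_((1.0 - tau) * parameter.data)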