def train_forward_dynamics(self):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None

    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    predicted_new_states = self.fwd(states, actions)
    mse_error = F.mse_loss(predicted_new_states, new_states)

    self.fwd_optim.zero_grad()
    mse_error.backward()
    self.fwd_optim.step()

    return mse_error
def train_policy(self, clip_gradients=True):
    # Sample mini-batch from the replay buffer uniformly or from the
    # prioritized experience replay.
    # If the size of the buffer is less than batch size then return
    if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
        return None

    transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    q_values = self.policy_network(states)
    next_q_values = self.policy_network(new_states)
    next_q_state_values = self.target_policy_network(new_states).detach()

    q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
    next_q_value = next_q_state_values.gather(
        1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
    expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)
    expected_q_value = expected_q_value.detach()

    td_loss = F.smooth_l1_loss(q_value, expected_q_value)

    self.policy_optim.zero_grad()
    td_loss.backward()
    if clip_gradients:
        for param in self.policy_network.parameters():
            param.grad.data.clamp_(-1, 1)
    self.policy_optim.step()

    return td_loss
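# The update above follows the Double DQN scheme: the online policy network
# selects the greedy next action while the target network evaluates it. The
# helper below is a minimal stand-alone sketch of that target computation for
# reference; it is illustrative and not part of the original class.
def double_dqn_target(online_q_next, target_q_next, rewards, dones, gamma):
    # Action selection with the online network
    best_actions = torch.max(online_q_next, 1)[1].unsqueeze(1)
    # Action evaluation with the target network
    next_q = target_q_next.gather(1, best_actions).squeeze(1)
    return rewards + gamma * next_q * (1 - dones)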
def calc_td_error(self):
    """
    Calculates the TD error against the Bellman target and takes an
    optimization step on it.
    :return: the TD loss
    """
    # Calculate the TD error only for the sampled transitions
    transitions = self.buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    state = batch.state
    new_state = batch.next_state
    action = batch.action
    reward = batch.reward
    done = batch.done

    state = Variable(torch.cat(state))
    new_state = Variable(torch.cat(new_state), requires_grad=False)
    action = Variable(torch.cat(action))
    reward = Variable(torch.cat(reward))
    done = Variable(torch.cat(done))

    if self.use_cuda:
        state = state.cuda()
        action = action.cuda()
        reward = reward.cuda()
        new_state = new_state.cuda()
        done = done.cuda()

    q_values = self.current_model(state)
    next_q_values = self.current_model(new_state)
    next_q_state_values = self.target_model(new_state)

    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value = next_q_state_values.gather(
        1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
    expected_q_value = reward + self.gamma * next_q_value * (1 - done)

    loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()

    self.optim.zero_grad()
    loss.backward()
    self.optim.step()

    return loss
def fit_batch_dqn(self):
    # Sample mini-batch from the replay buffer uniformly or from the
    # prioritized experience replay.
    # If the size of the buffer is less than batch size then return
    if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
        return None

    transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    # Encode the states and the new states
    states = self.encoder(states)
    new_states = self.encoder(new_states)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = self.policy_network(states).gather(
        1, actions.unsqueeze(1)).squeeze(1)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = self.target_policy_network(new_states).max(1)[0].detach()
    next_state_values = next_state_values * (1 - dones)
    y = rewards + self.gamma * next_state_values

    td_loss = F.smooth_l1_loss(state_action_values, y)

    self.policy_optim.zero_grad()
    td_loss.backward()
    for param in self.policy_network.parameters():
        param.grad.data.clamp_(-1, 1)
    self.policy_optim.step()

    return td_loss
def fit_batch(self):
    transitions = self.buffer.sample_batch(self.bs)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    with torch.no_grad():
        new_states = Variable(torch.cat(new_states))
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    value_loss, values = self.calc_soft_value_function_error(states)
    q_loss, q_values = self.calc_soft_q_function_error(
        states, actions, new_states, rewards, dones)
    policy_loss = self.calc_policy_loss(states, q_values, values)

    # Update the networks
    self.value_optim.zero_grad()
    value_loss.backward()
    self.value_optim.step()

    self.critic_optim.zero_grad()
    q_loss.backward()
    self.critic_optim.step()

    self.actor_optim.zero_grad()
    policy_loss.backward()
    self.actor_optim.step()

    # Update the target networks
    self.update_target_networks()

    return value_loss, q_loss, policy_loss
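# fit_batch above delegates the target update to self.update_target_networks().
# A common way to implement that step is a Polyak (soft) update with the
# coefficient tau; the helper below is a sketch of that convention, not
# necessarily the exact implementation used by this class.
def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)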
def train_forward_dynamics(self, clamp_gradients=False, use_difference_representation=True):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None

    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states))
    actions = Variable(torch.cat(actions))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        new_states = new_states.cuda()

    if use_difference_representation:
        # Under this representation, the model predicts the difference between
        # the current state and the next state.
        diff_new_states = self.fwd(states, actions)
        predicted_new_states = states + diff_new_states
    else:
        predicted_new_states = self.fwd(states, actions)

    mse_error = F.smooth_l1_loss(predicted_new_states, new_states)

    self.fwd_optim.zero_grad()
    mse_error.backward()

    # Clamp the gradients
    if clamp_gradients:
        for param in self.fwd.parameters():
            param.grad.data.clamp_(-1, 1)

    self.fwd_optim.step()

    return mse_error
def fit_batch(self):
    # Sample mini-batch from the buffer uniformly or using prioritized experience replay
    # If the size of the buffer is less than batch size then return
    if self.buffer.get_buffer_size() < self.batch_size:
        return None, None

    transitions = self.buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    # Step 2: Compute the target values using the target actor network and target critic network
    # Compute the Q-values given the current state (in this case it is the new_states)
    with torch.no_grad():
        new_action = self.target_actor(new_states)
        # Find the Q-value for the action according to the target actor network
        # We do this because calculating max over a continuous action space is intractable
        next_Q_values = self.target_critic(new_states, new_action)
        next_Q_values = torch.squeeze(next_Q_values, dim=1)
        next_Q_values = next_Q_values * (1 - dones)
        y = rewards + self.gamma * next_Q_values

    # Zero the optimizer gradients
    self.actor_optim.zero_grad()
    self.critic_optim.zero_grad()

    # Forward pass
    outputs = torch.squeeze(self.critic(states, actions), dim=1)
    loss = self.criterion(outputs, y)
    loss.backward()
    # Clamp the gradients to avoid exploding gradients
    for param in self.critic.parameters():
        param.grad.data.clamp_(-1, 1)
    self.critic_optim.step()

    # Updating the actor policy
    policy_loss = -1 * self.critic(states, self.actor(states))
    policy_loss = policy_loss.mean()
    policy_loss.backward()
    # Clamp the gradients to avoid exploding gradients
    for param in self.actor.parameters():
        param.grad.data.clamp_(-1, 1)
    self.actor_optim.step()

    return loss, policy_loss
def __init__(self, num_hidden_units, input_dim, num_actions, num_q_val,
             observation_dim, goal_dim, batch_size, use_cuda, gamma,
             random_seed, actor_optimizer, critic_optimizer,
             actor_learning_rate, critic_learning_rate, loss_function,
             polyak_constant, buffer_capacity, non_conv=True,
             num_conv_layers=None, num_pool_layers=None,
             conv_kernel_size=None, img_height=None, img_width=None,
             input_channels=None):

    self.num_hidden_units = num_hidden_units
    self.non_conv = non_conv
    self.num_actions = num_actions
    self.num_q = num_q_val
    self.obs_dim = observation_dim
    self.goal_dim = goal_dim
    self.input_dim = input_dim
    self.batch_size = batch_size
    self.cuda = use_cuda
    self.gamma = gamma
    self.seed(random_seed)
    self.actor_optim = actor_optimizer
    self.critic_optim = critic_optimizer
    self.actor_lr = actor_learning_rate
    self.critic_lr = critic_learning_rate
    self.criterion = loss_function
    self.tau = polyak_constant
    self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                      seed=random_seed)

    # Convolution parameters
    self.num_conv = num_conv_layers
    self.pool = num_pool_layers
    self.im_height = img_height
    self.im_width = img_width
    self.conv_kernel_size = conv_kernel_size
    self.input_channels = input_channels

    if non_conv:
        self.target_actor = ActorDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_action=num_actions,
            input=input_dim)
        self.actor = ActorDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_action=num_actions,
            input=input_dim)
        self.target_critic = CriticDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_q_value=num_q_val,
            input=input_dim,
            action_dim=num_actions,
            goal_dim=self.goal_dim)
        self.critic = CriticDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_q_value=num_q_val,
            input=input_dim,
            action_dim=num_actions,
            goal_dim=self.goal_dim)
    else:
        self.target_actor = ActorDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_action=self.num_actions,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)
        self.actor = ActorDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_action=self.num_actions,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)
        self.target_critic = CriticDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_q_value=self.num_q,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)
        self.critic = CriticDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_q_value=self.num_q,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)

    if self.cuda:
        self.target_actor = self.target_actor.cuda()
        self.actor = self.actor.cuda()
        self.target_critic = self.target_critic.cuda()
        self.critic = self.critic.cuda()

    # Initializing the target networks with the standard network weights
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.target_critic.load_state_dict(self.critic.state_dict())

    # Create the optimizers for the actor and critic using the corresponding learning rates
    actor_parameters = self.actor.parameters()
    critic_parameters = self.critic.parameters()

    self.actor_optim = opt.Adam(actor_parameters, lr=self.actor_lr)
    self.critic_optim = opt.Adam(critic_parameters, lr=self.critic_lr)

    # Initialize a random exploration noise
    self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
        self.num_actions)
def __init__(self, state_dim, action_dim, hidden_dim, actor, critic,
             value_network, target_value_network, polyak_constant,
             actor_learning_rate, critic_learning_rate, value_learning_rate,
             num_q_value, num_v_value, batch_size, gamma, random_seed,
             num_epochs, num_rollouts, num_eval_rollouts, env, eval_env,
             nb_train_steps, max_episodes_per_epoch, output_folder, use_cuda,
             buffer_capacity, policy_reg_mean_weight=1e-3,
             policy_reg_std_weight=1e-3, policy_preactivation_weight=0,
             verbose=True, plot_stats=False):

    self.state_dim = state_dim
    self.action_dim = action_dim
    self.hidden = hidden_dim
    self.q_dim = num_q_value
    self.v_dim = num_v_value
    self.actor = actor
    self.critic = critic
    self.value = value_network
    self.tau = polyak_constant
    self.bs = batch_size
    self.gamma = gamma
    self.seed = random_seed
    self.use_cuda = use_cuda
    self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                      seed=self.seed)
    self.policy_reg_mean_weight = policy_reg_mean_weight
    self.policy_reg_std_weight = policy_reg_std_weight
    self.policy_pre_activation_weight = policy_preactivation_weight

    # Training specific parameters
    self.num_epochs = num_epochs
    self.num_rollouts = num_rollouts
    self.num_eval_rollouts = num_eval_rollouts
    self.env = env
    self.eval_env = eval_env
    self.nb_train_steps = nb_train_steps
    self.max_episodes_per_epoch = max_episodes_per_epoch
    self.statistics = defaultdict(float)
    self.combined_statistics = defaultdict(list)
    self.verbose = verbose
    self.output_folder = output_folder
    self.plot_stats = plot_stats

    self.actor_optim = optim.Adam(lr=actor_learning_rate,
                                  params=self.actor.parameters())
    self.critic_optim = optim.Adam(lr=critic_learning_rate,
                                   params=self.critic.parameters())
    self.value_optim = optim.Adam(lr=value_learning_rate,
                                  params=self.value.parameters())

    self.target_value = target_value_network

    if self.use_cuda:
        self.actor = self.actor.cuda()
        self.critic = self.critic.cuda()
        self.value = self.value.cuda()
        self.target_value = self.target_value.cuda()

    # Initializing the target networks with the standard network weights
    self.target_value.load_state_dict(self.value.state_dict())

    # Initialize a random exploration noise
    self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
        self.action_dim)
def train(self):
    epoch_episode_rewards = []

    # Initialize the training with an initial state
    state = self.env.reset()
    episode_reward = 0

    # Initialize the losses
    fwd_loss = 0
    stats_loss = 0
    policy_loss = 0

    # Mean rewards
    mean_rewards = []

    # Check whether to use cuda or not
    state = to_tensor(state, use_cuda=self.use_cuda)
    with torch.no_grad():
        state = self.encoder(state)
    state = state.detach()

    for frame_idx in range(1, self.num_frames + 1):
        epsilon_by_frame = epsilon_greedy_exploration()
        epsilon = epsilon_by_frame(frame_idx)
        action = self.policy_network.act(state, epsilon)

        # Execute the action
        next_state, reward, done, success = self.env.step(action.item())
        episode_reward += reward

        reward = np.sign(reward)
        next_state = to_tensor(next_state, use_cuda=self.use_cuda)
        with torch.no_grad():
            next_state = self.encoder(next_state)
        next_state = next_state.detach()

        reward = torch.tensor([reward], dtype=torch.float)
        done_bool = done * 1
        done_bool = torch.tensor([done_bool], dtype=torch.float)

        # Store in the replay buffer
        self.store_transition(state=state, new_state=next_state,
                              action=action, done=done_bool, reward=reward)
        state = next_state

        if done:
            epoch_episode_rewards.append(episode_reward)
            # Add episode reward to tensorboard
            episode_reward = 0
            state = self.env.reset()
            state = to_tensor(state, use_cuda=self.use_cuda)
            state = self.encoder(state)

        # Train the forward dynamics model
        if len(self.replay_buffer) > self.fwd_limit:
            # Sample a minibatch from the replay buffer
            transitions = self.replay_buffer.sample_batch(self.batch_size)
            batch = Buffer.Transition(*zip(*transitions))
            batch = self.get_train_variables(batch)
            mse_loss = self.train_forward_dynamics(batch=batch)
            fwd_loss += mse_loss.item()
            if frame_idx % self.print_every == 0:
                print('Forward Dynamics Loss :',
                      fwd_loss / (frame_idx - self.fwd_limit))

        # Train the statistics network and the policy
        if len(self.replay_buffer) > self.policy_limit:
            transitions = self.replay_buffer.sample_batch(self.batch_size)
            batch = Buffer.Transition(*zip(*transitions))
            batch = self.get_train_variables(batch)
            loss, aug_rewards = self.train_statistics_network(batch=batch)
            p_loss = self.train_policy(batch=batch, rewards=aug_rewards)
            stats_loss += loss.item()
            policy_loss += p_loss.item()
            if frame_idx % self.print_every == 0:
                print('Statistics Loss: ',
                      stats_loss / (frame_idx - self.policy_limit))
                print('Policy Loss: ',
                      policy_loss / (frame_idx - self.policy_limit))

        # Print the statistics
        if self.verbose:
            if frame_idx % self.print_every == 0:
                print('Mean Reward ', str(np.mean(epoch_episode_rewards)))
                print('Sum of Rewards ', str(np.sum(epoch_episode_rewards)))
                mean_rewards.append(np.mean(epoch_episode_rewards))

        if self.plot_stats:
            if frame_idx % self.plot_every == 0:
                # Plot the statistics calculated
                self.plot(frame_idx=frame_idx,
                          rewards=epoch_episode_rewards,
                          mean_rewards=mean_rewards,
                          output_folder=self.output_folder,
                          placeholder_name='/DQN_montezuma_intrinsic')

        # Update the target network
        if frame_idx % self.update_every == 0:
            self.update_networks()

        # Save the models and the rewards file
        if frame_idx % self.save_epoch == 0:
            self.save_m()
            self.save_rewards(ep_rewards=epoch_episode_rewards,
                              mean_rewards=mean_rewards)

    self.save_m()
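# The loop above calls epsilon_greedy_exploration() to obtain a frame-indexed
# epsilon schedule. The factory below is a sketch of such a schedule, assuming
# an exponential decay; the name and the decay constants are illustrative, not
# taken from the original code.
def epsilon_greedy_exploration_sketch(eps_start=1.0, eps_final=0.01, eps_decay=30000):
    import math

    def epsilon_by_frame(frame_idx):
        # Decays from eps_start towards eps_final as frame_idx grows
        return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)

    return epsilon_by_frame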
def __init__(self, env, encoder, forward_dynamics, statistics_network,
             target_policy_network, policy_network, forward_dynamics_lr,
             stats_lr, policy_lr, num_train_epochs, num_frames,
             num_fwd_train_steps, num_stats_train_steps, fwd_dynamics_limit,
             stats_network_limit, policy_limit, size_replay_buffer,
             random_seed, polyak_constant, discount_factor, batch_size,
             action_space, model_output_folder, save_epoch,
             target_stats_network=None, target_fwd_dynamics_network=None,
             clip_rewards=True, clip_augmented_rewards=False,
             print_every=2000, update_network_every=2000, plot_every=5000,
             intrinsic_param=0.01, non_episodic_intrinsic=True,
             use_mine_formulation=True, use_cuda=False, save_models=True,
             plot_stats=False, verbose=True):

    self.encoder = encoder
    self.fwd = forward_dynamics
    self.stats = statistics_network
    self.use_cuda = use_cuda
    self.policy_network = policy_network
    self.target_policy_network = target_policy_network
    self.output_folder = model_output_folder
    self.use_mine_formulation = use_mine_formulation
    self.env = env
    self.train_epochs = num_train_epochs
    self.num_frames = num_frames
    self.num_fwd_train_steps = num_fwd_train_steps
    self.num_stats_train_steps = num_stats_train_steps
    self.fwd_lr = forward_dynamics_lr
    self.stats_lr = stats_lr
    self.policy_lr = policy_lr
    self.random_seed = random_seed
    self.save_models = save_models
    self.plot_stats = plot_stats
    self.verbose = verbose
    self.intrinsic_param = intrinsic_param
    self.save_epoch = save_epoch
    self.clip_rewards = clip_rewards
    self.clip_augmented_rewards = clip_augmented_rewards
    self.max = torch.zeros(1)
    self.min = torch.zeros(1)
    self.fwd_limit = fwd_dynamics_limit
    self.stats_limit = stats_network_limit
    self.policy_limit = policy_limit
    self.print_every = print_every
    self.update_every = update_network_every
    self.plot_every = plot_every
    self.non_episodic = non_episodic_intrinsic

    self.statistics = defaultdict(float)
    self.combined_statistics = defaultdict(list)

    self.target_stats_network = target_stats_network
    self.target_fwd_dynamics_network = target_fwd_dynamics_network

    # Fix the encoder weights
    for param in self.encoder.parameters():
        param.requires_grad = False

    self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                             seed=self.random_seed)
    self.tau = polyak_constant
    self.gamma = discount_factor
    self.batch_size = batch_size
    self.action_space = action_space

    torch.manual_seed(self.random_seed)
    if self.use_cuda:
        torch.cuda.manual_seed(self.random_seed)

    if self.use_cuda:
        self.encoder = self.encoder.cuda()
        self.fwd = self.fwd.cuda()
        self.stats = self.stats.cuda()
        self.policy_network = self.policy_network.cuda()
        self.target_policy_network = self.target_policy_network.cuda()

    self.fwd_optim = optim.Adam(params=self.fwd.parameters(),
                                lr=self.fwd_lr)
    self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                   lr=self.policy_lr)
    self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                  lr=self.stats_lr)

    # Update the policy and target policy networks
    self.update_networks()
def __init__(self, env, encoder, inverse_dynamics, forward_dynamics,
             source_distribution, statistics_network, target_policy_network,
             policy_network, encoder_lr, inverse_dynamics_lr,
             forward_dynamics_lr, source_d_lr, stats_lr, policy_lr,
             num_train_epochs, num_epochs, num_rollouts, size_replay_buffer,
             size_dqn_replay_buffer, random_seed, polyak_constant,
             discount_factor, batch_size, action_space, observation_space,
             model_output_folder, train_encoder=False,
             use_mine_formulation=True, use_cuda=False):

    self.encoder = encoder
    self.invd = inverse_dynamics
    self.fwd = forward_dynamics
    self.source = source_distribution
    self.stats = statistics_network
    self.use_cuda = use_cuda
    self.policy_network = policy_network
    self.target_policy_network = target_policy_network
    self.model_output_folder = model_output_folder
    self.use_mine_formulation = use_mine_formulation
    self.env = env
    self.num_epochs = num_epochs
    self.train_epochs = num_train_epochs
    self.num_rollouts = num_rollouts
    self.e_lr = encoder_lr
    self.invd_lr = inverse_dynamics_lr
    self.fwd_lr = forward_dynamics_lr
    self.source_lr = source_d_lr
    self.stats_lr = stats_lr
    self.policy_lr = policy_lr
    self.random_seed = random_seed

    self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                             seed=self.random_seed)
    self.dqn_replay_buffer = Buffer.ReplayBuffer(
        capacity=size_dqn_replay_buffer, seed=self.random_seed)

    self.tau = polyak_constant
    self.gamma = discount_factor
    self.batch_size = batch_size
    self.action_space = action_space
    self.obs_space = observation_space

    torch.manual_seed(self.random_seed)
    if self.use_cuda:
        torch.cuda.manual_seed(self.random_seed)

    if self.use_cuda:
        self.encoder = self.encoder.cuda()
        self.invd = self.invd.cuda()
        self.fwd = self.fwd.cuda()
        self.policy_network = self.policy_network.cuda()
        self.source = self.source.cuda()

    # Define the optimizers
    if train_encoder:
        self.e_optim = optim.Adam(params=self.encoder.parameters(),
                                  lr=self.e_lr)
    self.invd_optim = optim.Adam(params=self.invd.parameters(),
                                 lr=self.invd_lr)
    self.fwd_optim = optim.Adam(params=self.fwd.parameters(),
                                lr=self.fwd_lr)
    self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                   lr=self.policy_lr)
    self.source_optim = optim.Adam(params=self.source.parameters(),
                                   lr=self.source_lr)
    self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                  lr=self.stats_lr)
class PPO(object):

    def __init__(self, actions, device, lr=1e-2, gamma=0.99, ppo_clip=0.2,
                 ppo_epoch=5, batch_size=8):
        super(PPO, self).__init__()
        self.device = device
        self.policy_new = ActorCritic(actions).to(self.device)
        self.policy_old = ActorCritic(actions).to(self.device)
        self.policy_new.apply(init_weights)
        self.policy_old.apply(init_weights)
        self.optimizer = optim.Adam(self.policy_new.parameters(), lr=lr)
        self.policy_old.load_state_dict(self.policy_new.state_dict())
        self.ppo_clip = ppo_clip
        self.ppo_epoch = ppo_epoch
        self.batch_size = batch_size
        self.memory = Buffer(device)
        self.max_timestamp_per_episode = 1000
        self.gamma = gamma
        self.value_loss_coef = 0.5
        self.MseLoss = nn.MSELoss()

    def get_action_and_prob(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action_predictions, _ = self.policy_old(state)
        action_probabilities = F.softmax(action_predictions, dim=-1)
        distributions = Categorical(action_probabilities)
        action = distributions.sample()
        action_log_prob = distributions.log_prob(action)
        return action, action_log_prob

    def evaluate_policy(self, states, actions):
        action_predictions, value_pred = self.policy_new(states)
        action_probabilities = F.softmax(action_predictions, dim=-1)
        distributions = Categorical(action_probabilities)
        action_log_probs = distributions.log_prob(actions)
        return action_log_probs, value_pred

    def calculate_rewards(self, old_rewards):
        rewards = []
        discounted_reward = 0
        for reward in reversed(old_rewards):
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.append(discounted_reward)
        rewards.reverse()
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        return rewards

    def calculate_advantage(self, discounted_rewards, critic_rewards):
        advantages = discounted_rewards - critic_rewards
        advantages = (advantages - advantages.mean()) / advantages.std()
        return advantages

    def run_episode(self, env, train):
        state = env.reset()
        total_rewards = 0
        loss = 0
        for i in range(self.max_timestamp_per_episode):
            if not train:
                env.render()
            action, action_log_prob = self.get_action_and_prob(state)
            next_state, reward, done, _ = env.step(action.item())
            total_rewards += reward
            if train:
                self.memory.add_transition(state, action, reward,
                                           action_log_prob, done)
            state = next_state
            if done:
                break
        if train:
            loss = self.train_policy()
            self.memory.clear()
        return total_rewards, loss

    def train_policy(self):
        old_states, old_actions, old_probs, rewards, _ = self.memory.get_transitions()
        rewards = self.calculate_rewards(rewards)
        epoch_loss = []
        for epoch in range(self.ppo_epoch):
            probs, value_pred = self.evaluate_policy(old_states, old_actions)
            # Flatten the value predictions so they broadcast against the rewards
            value_pred = value_pred.squeeze(-1)
            # Importance-sampling ratio between the new and old policies
            ratio = (probs - old_probs).exp()
            advantages = self.calculate_advantage(rewards, value_pred.detach())

            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, min=1.0 - self.ppo_clip,
                                max=1.0 + self.ppo_clip) * advantages

            self.optimizer.zero_grad()
            loss = (-torch.min(surr1, surr2) +
                    self.value_loss_coef * F.smooth_l1_loss(value_pred, rewards)).mean()
            loss.backward()
            self.optimizer.step()
            epoch_loss.append(loss.item())

        self.policy_old.load_state_dict(self.policy_new.state_dict())
        return np.mean(epoch_loss)
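# A minimal usage sketch for the PPO class above, assuming a discrete-action
# Gym environment with the old 4-tuple step API (e.g. gym < 0.26) and that
# "CartPole-v1" is available; the environment name and episode count are
# illustrative only.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = PPO(actions=env.action_space.n, device=device)

    for episode in range(500):
        episode_reward, episode_loss = agent.run_episode(env, train=True)
        if episode % 10 == 0:
            print("Episode {}: reward={:.1f}, loss={:.4f}".format(
                episode, episode_reward, episode_loss))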
def train_statistics_network(self, use_jenson_shannon_divergence=True,
                             use_target_forward_dynamics=False,
                             use_target_stats_network=False,
                             clamp_gradients=False):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None, None, None, None

    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    all_actions = self.get_all_actions(self.action_space)
    all_actions = Variable(torch.cat(all_actions))

    # Approximate the marginal over next states by averaging the forward-model
    # predictions over all actions for each state
    new_state_marginals = []
    for state in states:
        state = state.expand(self.action_space, -1)
        if use_target_forward_dynamics:
            n_s = self.target_fwd(state, all_actions)
        else:
            n_s = self.fwd(state, all_actions)
        n_s = n_s.detach()
        n_s = n_s + state
        n_s = torch.mean(n_s, dim=0)
        n_s = torch.unsqueeze(n_s, dim=0)
        new_state_marginals.append(n_s)

    new_state_marginals = tuple(new_state_marginals)
    new_state_marginals = Variable(torch.cat(new_state_marginals),
                                   requires_grad=False)

    p_sa = self.stats(new_states, actions)
    p_s_a = self.stats(new_state_marginals, actions)

    p_s_ta = self.target_stats(new_states, actions)
    p_s_t_a = self.target_stats(new_state_marginals, actions)

    if use_jenson_shannon_divergence:
        # Improves stability and the gradients are unbiased
        if use_target_stats_network:
            mutual_information = -F.softplus(-p_s_ta) - F.softplus(p_s_t_a)
        else:
            mutual_information = -F.softplus(-p_sa) - F.softplus(p_s_a)
        lower_bound = torch.mean(-F.softplus(-p_sa)) - torch.mean(
            F.softplus(p_s_a))
    else:
        # Use the KL divergence formulation
        if use_target_stats_network:
            mutual_information = p_s_ta - torch.log(torch.exp(p_s_t_a))
        else:
            mutual_information = p_sa - torch.log(torch.exp(p_s_a))
        lower_bound = torch.mean(p_sa) - torch.log(
            torch.mean(torch.exp(p_s_a)))

    # Maximize the mutual information
    loss = -lower_bound

    self.stats_optim.zero_grad()
    loss.backward()
    # Clamp the gradients
    if clamp_gradients:
        for param in self.stats.parameters():
            param.grad.data.clamp_(-1, 1)
    self.stats_optim.step()

    # Augment the extrinsic reward with the intrinsic (mutual information) term
    # and store it in the DQN replay buffer
    mutual_information = torch.squeeze(mutual_information, dim=-1)
    mutual_information = mutual_information.detach()
    rewards_combined = rewards + self.intrinsic_param * mutual_information

    # Store the updated reward transition in the replay buffer
    self.store_transition(state=states, action=actions,
                          new_state=new_states, reward=rewards_combined,
                          done=dones, buffer=self.dqn_replay_buffer)

    return loss, rewards, mutual_information, lower_bound
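# Stand-alone sketch of the Jensen-Shannon mutual-information lower bound that
# train_statistics_network maximises above (the -softplus(-T) / softplus(T)
# form used by MINE-style estimators). t_joint holds statistics-network scores
# on joint samples and t_marginal holds scores on marginal samples; the helper
# name is illustrative and relies on torch / F imported elsewhere in this module.
def jsd_mi_lower_bound(t_joint, t_marginal):
    # E_joint[-softplus(-T)] - E_marginal[softplus(T)]
    return torch.mean(-F.softplus(-t_joint)) - torch.mean(F.softplus(t_marginal))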