示例#1
0
    def train_statistics_network(self):
        """Take one optimisation step on the statistics network.

        A batch is sampled from ``self.replay_buffer``. For every state the
        marginal next state is estimated by averaging the forward model's
        predictions over all actions. The statistics network is then trained
        to maximise ``mean(T(s', a)) - log(mean(exp(T(s'_marginal, a))))``.
        Finally the batch is written to ``self.dqn_replay_buffer`` with the
        per-sample bonus ``T(s', a) - T(s'_marginal, a)`` added to the reward.

        :return: the scalar loss (negative lower bound), or ``None`` when the
            replay buffer holds fewer than ``self.batch_size`` transitions.
        """
        if self.replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        states = Variable(torch.cat(batch.state))
        # The next states are only ever used as inputs/targets, never
        # differentiated.
        new_states = Variable(torch.cat(batch.next_state), requires_grad=False)
        actions = Variable(torch.cat(batch.action))
        rewards = Variable(torch.cat(batch.reward))
        dones = Variable(torch.cat(batch.done))

        all_actions = self.get_all_actions(self.action_space)
        all_actions = Variable(torch.cat(all_actions))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()
            # Must live on the same device as the states fed to the model.
            all_actions = all_actions.cuda()

        # Marginal next state per sample: mean prediction over every action.
        # A dedicated name is used so the sampled ``new_states`` batch is not
        # overwritten inside the loop.
        new_state_marginals = []
        for state in states:
            tiled_state = state.expand(self.action_space, -1)
            # The forward model is not trained here, so cut its graph.
            predicted = self.fwd(tiled_state, all_actions).detach()
            marginal = torch.mean(predicted, dim=0)
            new_state_marginals.append(torch.unsqueeze(marginal, dim=0))
        new_state_marginals = torch.cat(new_state_marginals)

        joint_scores = self.stats(new_states, actions)
        marginal_scores = self.stats(new_state_marginals, actions)

        # Scalar lower bound; its negative is what gets minimised.
        lower_bound = torch.mean(joint_scores) - torch.log(
            torch.mean(torch.exp(marginal_scores)))

        loss = -lower_bound
        self.stats_optim.zero_grad()
        loss.backward()
        self.stats_optim.step()

        # Per-sample bonus, detached so the stored reward carries no graph.
        mutual_information = (joint_scores - marginal_scores).detach()
        # NOTE(review): assumes the statistics network may emit a trailing
        # singleton dimension; squeezing it keeps the sum with ``rewards``
        # element-wise instead of broadcasting — confirm against self.stats.
        mutual_information = torch.squeeze(mutual_information, dim=-1)

        # Store in the dqn replay buffer
        rewards = rewards + mutual_information
        self.store_transition(buffer=self.dqn_replay_buffer,
                              state=states,
                              action=actions,
                              new_state=new_states,
                              reward=rewards,
                              done=dones, success=None)

        return loss
示例#2
0
    def train_forward_dynamics(self):
        """Take one optimisation step on the forward dynamics model.

        Samples ``self.batch_size`` transitions from ``self.replay_buffer``
        and regresses ``self.fwd(states, actions)`` onto the observed next
        states with a mean-squared-error loss.

        :return: the MSE loss of the step, or ``None`` when the replay buffer
            holds fewer than ``self.batch_size`` transitions.
        """
        if self.replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Only states, actions and next states are needed for this loss.
        states = Variable(torch.cat(batch.state))
        # The regression target must not require gradients. ``volatile`` is
        # not used because it would disable gradients for the whole loss on
        # old PyTorch and is ignored on current releases.
        new_states = Variable(torch.cat(batch.next_state), requires_grad=False)
        actions = Variable(torch.cat(batch.action))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            new_states = new_states.cuda()

        predicted_new_states = self.fwd(states, actions)
        mse_error = F.mse_loss(predicted_new_states, new_states)
        self.fwd_optim.zero_grad()
        mse_error.backward()
        self.fwd_optim.step()

        return mse_error
示例#3
0
    def train_policy(self, clip_gradients=True):
        """Run one Double-DQN update of the policy network.

        A batch is drawn from ``self.dqn_replay_buffer``. The greedy next
        action is chosen by the policy network and evaluated by the target
        policy network; the policy network is then fitted to the resulting
        bootstrap target with a smooth-L1 loss.

        :param clip_gradients: when true, clamp every parameter gradient to
            ``[-1, 1]`` before the optimizer step.
        :return: the TD loss, or ``None`` when the buffer holds fewer than
            ``self.batch_size`` transitions.
        """
        # Not enough experience collected yet.
        if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
            return None

        sampled = self.dqn_replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*sampled))

        # Stack the per-transition tensors into batch tensors.
        states = Variable(torch.cat(batch.state))
        new_states = Variable(torch.cat(batch.next_state), requires_grad=False)
        actions = Variable(torch.cat(batch.action))
        rewards = Variable(torch.cat(batch.reward))
        dones = Variable(torch.cat(batch.done))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        online_q = self.policy_network(states)
        online_next_q = self.policy_network(new_states)
        target_next_q = self.target_policy_network(new_states).detach()

        # Q(s_t, a_t) for the actions actually taken.
        chosen_q = online_q.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Action selection by the online network, evaluation by the target.
        greedy_actions = torch.max(online_next_q, 1)[1].unsqueeze(1)
        bootstrap_q = target_next_q.gather(1, greedy_actions).squeeze(1)

        # Terminal transitions contribute the reward only.
        td_target = rewards + self.gamma * bootstrap_q * (1 - dones)
        td_target = td_target.detach()
        td_loss = F.smooth_l1_loss(chosen_q, td_target)

        self.policy_optim.zero_grad()
        td_loss.backward()
        if clip_gradients:
            for weight in self.policy_network.parameters():
                weight.grad.data.clamp_(-1, 1)
        self.policy_optim.step()

        return td_loss
示例#4
0
    def calc_td_error(self):
        """Compute the TD error against the Bellman target and take one step.

        Samples ``self.batch_size`` transitions from ``self.buffer``. The
        target uses the greedy next action of ``self.current_model``,
        evaluated by ``self.target_model``. The mean squared TD error is
        minimised with a single ``self.optim`` step.

        :return: the mean squared TD error of the sampled batch.
        """
        transitions = self.buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # ``state`` must stay differentiable: the loss is backpropagated
        # through ``current_model(state)``. Marking it volatile would block
        # that on old PyTorch and is ignored on current releases.
        state = Variable(torch.cat(batch.state))
        new_state = Variable(torch.cat(batch.next_state), requires_grad=False)
        action = Variable(torch.cat(batch.action))
        reward = Variable(torch.cat(batch.reward))
        done = Variable(torch.cat(batch.done))

        if self.use_cuda:
            state = state.cuda()
            action = action.cuda()
            reward = reward.cuda()
            new_state = new_state.cuda()
            done = done.cuda()

        q_values = self.current_model(state)
        next_q_values = self.current_model(new_state)
        next_q_state_values = self.target_model(new_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        # Online network picks the action, target network scores it.
        next_q_value = next_q_state_values.gather(
            1,
            torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # The bootstrap target is treated as a constant.
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        return loss
示例#5
0
    def fit_batch_dqn(self):
        """Run one DQN update on a batch from ``self.dqn_replay_buffer``.

        States and next states are passed through ``self.encoder``. The
        policy network is fitted with a smooth-L1 loss to
        ``reward + gamma * max_a Q_target(s', a)``, with the bootstrap term
        zeroed on terminal transitions. Gradients are clamped to ``[-1, 1]``.

        :return: the TD loss, or ``None`` when the sampled buffer holds fewer
            than ``self.batch_size`` transitions.
        """
        # Guard on the buffer that is actually sampled below.
        if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        states = Variable(torch.cat(batch.state))
        # The next states only feed the (detached) bootstrap target.
        new_states = Variable(torch.cat(batch.next_state), requires_grad=False)
        actions = Variable(torch.cat(batch.action))
        rewards = Variable(torch.cat(batch.reward))
        dones = Variable(torch.cat(batch.done))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        # Encode the states and the new states
        states = self.encoder(states)
        new_states = self.encoder(new_states)

        # Q(s_t, a_t): pick the column of the action taken. Reshaping makes
        # this work for actions stored as (batch,) or (batch, 1) and yields
        # a flat vector that matches the target below.
        q_values = self.policy_network(states)
        state_action_values = q_values.gather(1, actions.view(-1, 1)).squeeze(1)

        # V(s_{t+1}) from the target network, treated as a constant.
        next_state_values = self.target_policy_network(new_states).max(1)[0].detach()
        # Flatten rewards/dones so the target stays one value per sample
        # instead of silently broadcasting to (batch, batch).
        next_state_values = next_state_values * (1 - dones.view(-1))
        y = rewards.view(-1) + self.gamma * next_state_values
        td_loss = F.smooth_l1_loss(state_action_values, y)

        self.policy_optim.zero_grad()
        td_loss.backward()
        for param in self.policy_network.parameters():
            param.grad.data.clamp_(-1, 1)
        self.policy_optim.step()

        return td_loss
示例#6
0
    def fit_batch(self):
        """Update the value, critic and actor networks on one sampled batch.

        Samples ``self.bs`` transitions from ``self.buffer``, computes the
        value, Q-function and policy losses through the ``calc_*`` helpers,
        applies one optimizer step per network and then updates the target
        networks.

        :return: tuple ``(value_loss, q_loss, policy_loss)``.
        """
        transitions = self.buffer.sample_batch(self.bs)
        batch = Buffer.Transition(*zip(*transitions))

        states = Variable(torch.cat(batch.state))
        # Concatenating buffer tensors needs no autograd bookkeeping.
        new_states = Variable(torch.cat(batch.next_state))
        actions = Variable(torch.cat(batch.action))
        rewards = Variable(torch.cat(batch.reward))
        dones = Variable(torch.cat(batch.done))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        value_loss, values = self.calc_soft_value_function_error(states)
        q_loss, q_values = self.calc_soft_q_function_error(
            states, actions, new_states, rewards, dones)
        policy_loss = self.calc_policy_loss(states, q_values, values)

        # Update the networks
        self.value_optim.zero_grad()
        value_loss.backward()
        self.value_optim.step()

        self.critic_optim.zero_grad()
        q_loss.backward()
        # Apply the critic gradients; zeroing them here instead would leave
        # the critic untrained.
        self.critic_optim.step()

        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        # Update the target networks
        self.update_target_networks()

        return value_loss, q_loss, policy_loss
示例#7
0
    def train_forward_dynamics(self,
                               clamp_gradients=False,
                               use_difference_representation=True):
        """Fit the forward dynamics model on one batch of transitions.

        :param clamp_gradients: clamp every gradient of ``self.fwd`` to
            ``[-1, 1]`` before the optimizer step.
        :param use_difference_representation: when true the model output is
            treated as the change in state and added to the current state;
            otherwise it is taken as the next state itself.
        :return: the smooth-L1 loss between predicted and observed next
            states, or ``None`` when the replay buffer holds fewer than
            ``self.batch_size`` transitions.
        """
        if self.replay_buffer.get_buffer_size() < self.batch_size:
            return None

        sampled = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*sampled))

        # Stack the per-transition tensors into batch tensors.
        states = Variable(torch.cat(batch.state))
        new_states = Variable(torch.cat(batch.next_state))
        actions = Variable(torch.cat(batch.action))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            new_states = new_states.cuda()

        model_output = self.fwd(states, actions)
        if use_difference_representation:
            # The model predicts the delta between current and next state.
            predicted_new_states = states + model_output
        else:
            predicted_new_states = model_output

        # The name is historical: the loss is smooth-L1, not MSE.
        mse_error = F.smooth_l1_loss(predicted_new_states, new_states)
        self.fwd_optim.zero_grad()
        mse_error.backward()
        if clamp_gradients:
            for weight in self.fwd.parameters():
                weight.grad.data.clamp_(-1, 1)
        self.fwd_optim.step()

        return mse_error
示例#8
0
    def fit_batch(self):
        """Run one actor-critic update on a batch sampled from the buffer.

        The critic is regressed onto
        ``reward + gamma * Q_target(s', actor_target(s'))``, with the
        bootstrap term zeroed on terminal transitions. The actor is then
        updated to maximise the critic's value of its own actions. Gradients
        of both networks are clamped to ``[-1, 1]``.

        :return: tuple ``(critic_loss, policy_loss)``, or ``(None, None)``
            when the buffer holds fewer than ``self.batch_size`` transitions.
        """
        # If the size of the buffer is less than batch size then return
        if self.buffer.get_buffer_size() < self.batch_size:
            return None, None

        transitions = self.buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        states = Variable(torch.cat(batch.state))
        new_states = Variable(torch.cat(batch.next_state), requires_grad=False)
        actions = Variable(torch.cat(batch.action))
        rewards = Variable(torch.cat(batch.reward))
        dones = Variable(torch.cat(batch.done))

        # NOTE(review): sibling trainers use ``self.use_cuda``; if this class
        # is an nn.Module, ``self.cuda`` is a bound method and always truthy
        # — confirm which attribute is intended.
        if self.cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        # Target values from the target actor and target critic. The max over
        # a continuous action space is intractable, so the target actor's
        # action is used instead.
        new_action = self.target_actor(new_states)
        next_Q_values = self.target_critic(new_states, new_action)
        next_Q_values = torch.squeeze(next_Q_values, dim=1)
        next_Q_values = next_Q_values * (1 - dones)
        # Detach explicitly: toggling ``.volatile`` has no effect on current
        # PyTorch and would let gradients reach the target networks.
        y = (rewards + self.gamma * next_Q_values).detach()

        # Zero the optimizer gradients
        self.actor_optim.zero_grad()
        self.critic_optim.zero_grad()

        # Critic update
        outputs = self.critic(states, actions)
        # Mirror the squeeze applied to the target critic's output so the
        # loss is element-wise rather than broadcast.
        outputs = torch.squeeze(outputs, dim=1)
        loss = self.criterion(outputs, y)
        loss.backward()
        # Clamp the gradients to keep the update magnitude bounded
        for param in self.critic.parameters():
            param.grad.data.clamp_(-1, 1)
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's actions
        policy_loss = -1 * self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        # Clamp the gradients to keep the update magnitude bounded
        for param in self.actor.parameters():
            param.grad.data.clamp_(-1, 1)
        self.actor_optim.step()

        return loss, policy_loss
    def train(self):
        """Run the main training loop for ``self.num_frames`` environment steps.

        Each frame:

        * acts epsilon-greedily with the policy network on the encoded state;
        * stores the sign-clipped-reward transition via
          ``self.store_transition``;
        * trains the forward dynamics model once the replay buffer exceeds
          ``self.fwd_limit``;
        * trains the statistics network and the policy once it exceeds
          ``self.policy_limit``.

        Statistics are printed and plotted periodically, the target networks
        are refreshed every ``self.update_every`` frames and the models are
        saved every ``self.save_epoch`` frames and once more at the end.
        """
        epoch_episode_rewards = []
        mean_rewards = []
        episode_reward = 0

        # Running sums of the losses, used for the printed averages.
        fwd_loss = 0
        stats_loss = 0
        policy_loss = 0

        # Initialize the training with an initial, encoded state
        state = self.env.reset()
        state = to_tensor(state, use_cuda=self.use_cuda)
        with torch.no_grad():
            state = self.encoder(state)
        state = state.detach()

        # The schedule is only ever evaluated at the frame index, so it is
        # built once instead of on every frame.
        epsilon_by_frame = epsilon_greedy_exploration()

        for frame_idx in range(1, self.num_frames + 1):
            epsilon = epsilon_by_frame(frame_idx)
            action = self.policy_network.act(state, epsilon)

            # Execute the action
            next_state, reward, done, success = self.env.step(action.item())
            episode_reward += reward

            # Only the sign of the reward is stored for learning.
            reward = np.sign(reward)

            next_state = to_tensor(next_state, use_cuda=self.use_cuda)
            with torch.no_grad():
                next_state = self.encoder(next_state)
            next_state = next_state.detach()

            reward = torch.tensor([reward], dtype=torch.float)

            done_bool = done * 1
            done_bool = torch.tensor([done_bool], dtype=torch.float)

            # Store in the replay buffer
            self.store_transition(state=state,
                                  new_state=next_state,
                                  action=action,
                                  done=done_bool,
                                  reward=reward)

            state = next_state

            if done:
                epoch_episode_rewards.append(episode_reward)
                episode_reward = 0
                state = self.env.reset()
                state = to_tensor(state, use_cuda=self.use_cuda)
                # Encode without autograd, as for every other state, so the
                # replay buffer never holds tensors with a graph attached.
                with torch.no_grad():
                    state = self.encoder(state)
                state = state.detach()

            # Train the forward dynamics model
            if len(self.replay_buffer) > self.fwd_limit:
                # Sample a minibatch from the replay buffer
                transitions = self.replay_buffer.sample_batch(self.batch_size)
                batch = Buffer.Transition(*zip(*transitions))
                batch = self.get_train_variables(batch)
                mse_loss = self.train_forward_dynamics(batch=batch)
                fwd_loss += mse_loss.item()
                if frame_idx % self.print_every == 0:
                    print('Forward Dynamics Loss :',
                          fwd_loss / (frame_idx - self.fwd_limit))

            # Train the statistics network and the policy
            if len(self.replay_buffer) > self.policy_limit:
                transitions = self.replay_buffer.sample_batch(self.batch_size)
                batch = Buffer.Transition(*zip(*transitions))
                batch = self.get_train_variables(batch)
                loss, aug_rewards = self.train_statistics_network(batch=batch)

                # The policy is trained on the augmented rewards.
                p_loss = self.train_policy(batch=batch, rewards=aug_rewards)

                stats_loss += loss.item()
                policy_loss += p_loss.item()

                if frame_idx % self.print_every == 0:
                    print('Statistics Loss: ',
                          stats_loss / (frame_idx - self.policy_limit))
                    print('Policy Loss: ',
                          policy_loss / (frame_idx - self.policy_limit))

            # Print the statistics
            # NOTE(review): ``mean_rewards`` is only extended when
            # ``self.verbose`` is set, so plots and saved reward files are
            # empty otherwise — confirm this is intended.
            if self.verbose and frame_idx % self.print_every == 0:
                print('Mean Reward ', str(np.mean(epoch_episode_rewards)))
                print('Sum of Rewards ',
                      str(np.sum(epoch_episode_rewards)))
                mean_rewards.append(np.mean(epoch_episode_rewards))

            # Plot the statistics calculated
            if self.plot_stats and frame_idx % self.plot_every == 0:
                self.plot(frame_idx=frame_idx,
                          rewards=epoch_episode_rewards,
                          mean_rewards=mean_rewards,
                          output_folder=self.output_folder,
                          placeholder_name='/DQN_montezuma_intrinsic')

            # Update the target network
            if frame_idx % self.update_every == 0:
                self.update_networks()

            # Save the models and the rewards file
            if frame_idx % self.save_epoch == 0:
                self.save_m()
                self.save_rewards(ep_rewards=epoch_episode_rewards,
                                  mean_rewards=mean_rewards)

        self.save_m()
示例#10
0
    def train_statistics_network(self,
                                 use_jenson_shannon_divergence=True,
                                 use_target_forward_dynamics=False,
                                 use_target_stats_network=False,
                                 clamp_gradients=False):
        """Take one optimisation step on the statistics network.

        A batch is sampled from ``self.replay_buffer``. For every state the
        marginal next state is estimated by averaging the forward model's
        predicted next state over all actions. The statistics network is
        trained to maximise a lower bound built from its scores on the
        observed and the marginal next states. The batch is then stored in
        ``self.dqn_replay_buffer`` with
        ``self.intrinsic_param * mutual_information`` added to the reward.

        :param use_jenson_shannon_divergence: use the softplus-based bound
            instead of the ``mean(T) - log(mean(exp(T)))`` bound.
        :param use_target_forward_dynamics: predict next states with
            ``self.target_fwd`` instead of ``self.fwd``.
        :param use_target_stats_network: compute the per-sample bonus with
            ``self.target_stats`` instead of ``self.stats``. The training
            bound always uses ``self.stats``.
        :param clamp_gradients: clamp every gradient of ``self.stats`` to
            ``[-1, 1]`` before the optimizer step.
        :return: tuple ``(loss, rewards, mutual_information, lower_bound)``
            where ``rewards`` are the unmodified sampled rewards; four
            ``None`` values when the replay buffer holds fewer than
            ``self.batch_size`` transitions.
        """
        if self.replay_buffer.get_buffer_size() < self.batch_size:
            # Same arity as the normal return so callers can always unpack.
            return None, None, None, None

        transitions = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        states = Variable(torch.cat(batch.state))
        new_states = Variable(torch.cat(batch.next_state), requires_grad=False)
        actions = Variable(torch.cat(batch.action))
        rewards = Variable(torch.cat(batch.reward))
        dones = Variable(torch.cat(batch.done))

        all_actions = self.get_all_actions(self.action_space)
        all_actions = Variable(torch.cat(all_actions))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()
            # Must live on the same device as the states fed to the model.
            all_actions = all_actions.cuda()

        forward_model = self.target_fwd if use_target_forward_dynamics else self.fwd

        # Marginal next state per sample: mean prediction over every action.
        new_state_marginals = []
        for state in states:
            state = state.expand(self.action_space, -1)
            # The forward model is not trained here, so cut its graph.
            n_s = forward_model(state, all_actions).detach()
            # The model output is added to the state, i.e. it is treated as
            # a state delta.
            n_s = n_s + state
            n_s = torch.mean(n_s, dim=0)
            new_state_marginals.append(torch.unsqueeze(n_s, dim=0))

        new_state_marginals = Variable(torch.cat(new_state_marginals),
                                       requires_grad=False)

        # Scores of the trained network; these define the training bound.
        p_sa = self.stats(new_states, actions)
        p_s_a = self.stats(new_state_marginals, actions)

        # Scores used for the per-sample bonus. The target network is only
        # evaluated when it is actually requested.
        if use_target_stats_network:
            bonus_joint = self.target_stats(new_states, actions)
            bonus_marginal = self.target_stats(new_state_marginals, actions)
        else:
            bonus_joint = p_sa
            bonus_marginal = p_s_a

        if use_jenson_shannon_divergence:
            mutual_information = -F.softplus(-bonus_joint) - F.softplus(
                bonus_marginal)
            lower_bound = torch.mean(-F.softplus(-p_sa)) - torch.mean(
                F.softplus(p_s_a))
        else:
            # log(exp(x)) is x; writing it directly avoids overflow and
            # underflow in the exponential.
            mutual_information = bonus_joint - bonus_marginal
            lower_bound = torch.mean(p_sa) - torch.log(
                torch.mean(torch.exp(p_s_a)))

        # Maximize the lower bound
        loss = -lower_bound
        self.stats_optim.zero_grad()
        loss.backward()
        if clamp_gradients:
            for param in self.stats.parameters():
                param.grad.data.clamp_(-1, 1)
        self.stats_optim.step()

        # The bonus is a reward signal, not something to differentiate.
        mutual_information = torch.squeeze(mutual_information, dim=-1)
        mutual_information = mutual_information.detach()

        rewards_combined = rewards + self.intrinsic_param * mutual_information
        # Store the updated reward transition in the replay buffer
        self.store_transition(state=states,
                              action=actions,
                              new_state=new_states,
                              reward=rewards_combined,
                              done=dones,
                              buffer=self.dqn_replay_buffer)

        return loss, rewards, mutual_information, lower_bound