Example #1
    def train_forward_dynamics(self):

        if self.replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        rewards = batch.reward
        dones = batch.done

        states = Variable(torch.cat(states))
        new_states = Variable(torch.cat(new_states), requires_grad=False)
        actions = Variable(torch.cat(actions))
        rewards = Variable(torch.cat(rewards))
        dones = Variable(torch.cat(dones))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        predicted_new_states = self.fwd(states, actions)
        mse_error = F.mse_loss(predicted_new_states, new_states)
        self.fwd_optim.zero_grad()
        mse_error.backward()
        self.fwd_optim.step()

        return mse_error
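The snippet above uses the pre-0.4 `Variable`/`volatile` API. A minimal modernized sketch of the same forward-dynamics update, assuming a model `fwd` that takes `(states, actions)` and an optimizer `fwd_optim` (both hypothetical stand-ins for the attributes used above):

import torch.nn.functional as F

def forward_dynamics_step(fwd, fwd_optim, states, actions, new_states):
    # Predict the next state from (state, action) and regress onto the observed next state
    predicted_new_states = fwd(states, actions)
    mse_error = F.mse_loss(predicted_new_states, new_states.detach())
    fwd_optim.zero_grad()
    mse_error.backward()
    fwd_optim.step()
    return mse_error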
Example #2
    def train_policy(self, clip_gradients=True):
        # Sample mini-batch from the replay buffer uniformly or from the prioritized experience replay.

        # If the size of the buffer is less than batch size then return
        if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        rewards = batch.reward
        dones = batch.done

        states = Variable(torch.cat(states))
        new_states = Variable(torch.cat(new_states), requires_grad=False)
        actions = Variable(torch.cat(actions))
        rewards = Variable(torch.cat(rewards))
        dones = Variable(torch.cat(dones))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken

        q_values = self.policy_network(states)
        next_q_values = self.policy_network(new_states)

        next_q_state_values = self.target_policy_network(new_states).detach()

        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)
        expected_q_value = expected_q_value.detach()
        td_loss = F.smooth_l1_loss(q_value, expected_q_value)

        self.policy_optim.zero_grad()
        td_loss.backward()
        if clip_gradients:
            for param in self.policy_network.parameters():
                param.grad.data.clamp_(-1, 1)
        self.policy_optim.step()

        return td_loss
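The update above is the Double DQN rule: the greedy next action is selected with the online network and evaluated with the target network. A compact modern-PyTorch sketch of just the target and loss, assuming plain batched tensors rather than the `Variable` wrappers used above:

import torch
import torch.nn.functional as F

def double_dqn_loss(policy_net, target_net, states, actions, rewards,
                    next_states, dones, gamma=0.99):
    # Q(s_t, a_t) for the actions that were actually taken
    q_value = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # Select the greedy next action with the online network ...
        next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
        # ... and evaluate it with the target network
        next_q_value = target_net(next_states).gather(1, next_actions).squeeze(1)
        expected_q_value = rewards + gamma * next_q_value * (1 - dones)
    return F.smooth_l1_loss(q_value, expected_q_value)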
Example #3
    def calc_td_error(self):
        """
        Calculates the td error against the bellman target
        :return:
        """
        # Calculate the TD error only for the particular transition

        # Get the separate values from the named tuple
        transitions = self.buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        state = batch.state
        new_state = batch.next_state
        action = batch.action
        reward = batch.reward
        done = batch.done

        #reward = list(reward)
        #done = list(done)

        state = Variable(torch.cat(state))
        new_state = Variable(torch.cat(new_state), volatile=True)
        action = Variable(torch.cat(action))
        reward = Variable(torch.cat(reward))
        done = Variable(torch.cat(done))

        if self.use_cuda:
            state = state.cuda()
            action = action.cuda()
            reward = reward.cuda()
            new_state = new_state.cuda()
            done = done.cuda()

        q_values = self.current_model(state)
        next_q_values = self.current_model(new_state)
        next_q_state_values = self.target_model(new_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        return loss
Example #4
    def fit_batch_dqn(self):
        # Sample mini-batch from the replay buffer uniformly or from the prioritized experience replay.

        # If the size of the buffer is less than batch size then return
        if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        rewards = batch.reward
        dones = batch.done

        states = Variable(torch.cat(states))
        new_states = Variable(torch.cat(new_states), requires_grad=False)
        actions = Variable(torch.cat(actions))
        rewards = Variable(torch.cat(rewards))
        dones = Variable(torch.cat(dones))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken

        # Encode the states and the new states
        states = self.encoder(states)
        new_states = self.encoder(new_states)
        state_action_values = self.policy_network(states).gather(1, actions)
        # Compute V(s_{t+1}) for all next states.
        next_state_values = self.target_policy_network(new_states).max(1)[0].detach()
        next_state_values = next_state_values * (1 - dones)
        y = rewards + self.gamma * next_state_values
        td_loss = F.smooth_l1_loss(state_action_values, y)

        self.policy_optim.zero_grad()
        td_loss.backward()
        for param in self.policy_network.parameters():
            param.grad.data.clamp_(-1, 1)
        self.policy_optim.step()

        return td_loss
Example #5
    def __init__(self,
                 actions,
                 device,
                 lr=1e-2,
                 gamma=0.99,
                 ppo_clip=0.2,
                 ppo_epoch=5,
                 batch_size=8):
        super(PPO, self).__init__()
        self.device = device
        self.policy_new = ActorCritic(actions)
        self.policy_old = ActorCritic(actions)
        self.policy_new.apply(init_weights)
        self.policy_old.apply(init_weights)
        self.optimizer = optim.Adam(self.policy_new.parameters(), lr=lr)
        self.policy_old.load_state_dict(self.policy_new.state_dict())
        self.ppo_clip = ppo_clip
        self.ppo_epoch = ppo_epoch
        self.batch_size = batch_size
        self.memory = Buffer(device)
        self.max_timestamp_per_episode = 1000
        self.gamma = gamma
        self.value_loss_coef = 0.5
        self.MseLoss = nn.MSELoss()
Example #6
    def fit_batch(self):
        transitions = self.buffer.sample_batch(self.bs)
        batch = Buffer.Transition(*zip(*transitions))
        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        rewards = batch.reward
        dones = batch.done

        states = Variable(torch.cat(states))
        with torch.no_grad():
            new_states = Variable(torch.cat(new_states))
        actions = Variable(torch.cat(actions))
        rewards = Variable(torch.cat(rewards))
        dones = Variable(torch.cat(dones))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        value_loss, values = self.calc_soft_value_function_error(states)
        q_loss, q_values = self.calc_soft_q_function_error(
            states, actions, new_states, rewards, dones)
        policy_loss = self.calc_policy_loss(states, q_values, values)
        """
        Update the networks
        """
        self.value_optim.zero_grad()
        value_loss.backward()
        self.value_optim.step()

        self.critic_optim.zero_grad()
        q_loss.backward()
        self.critic_optim.step()

        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        # Update the target networks
        self.update_target_networks()

        return value_loss, q_loss, policy_loss
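`update_target_networks` is not shown in this excerpt; a common choice is a Polyak (soft) update using the mixing coefficient `tau` stored on the agent. A minimal sketch under that assumption:

import torch

def polyak_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.mul_(1.0 - tau).add_(tau * param.data)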
Example #7
    def train_forward_dynamics(self,
                               clamp_gradients=False,
                               use_difference_representation=True):

        if self.replay_buffer.get_buffer_size() < self.batch_size:
            return None

        transitions = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        states = Variable(torch.cat(states))
        new_states = Variable(torch.cat(new_states))
        actions = Variable(torch.cat(actions))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            new_states = new_states.cuda()

        if use_difference_representation:
            # Under this representation, the model predicts the difference between the current state and the next state.
            diff_new_states = self.fwd(states, actions)
            predicted_new_states = states + diff_new_states
        else:
            predicted_new_states = self.fwd(states, actions)

        mse_error = F.smooth_l1_loss(predicted_new_states, new_states)
        self.fwd_optim.zero_grad()
        mse_error.backward()
        # Clamp the gradients
        if clamp_gradients:
            for param in self.fwd.parameters():
                param.grad.data.clamp_(-1, 1)
        self.fwd_optim.step()

        return mse_error
Example #8
    def fit_batch(self):
        # Sample mini-batch from the buffer uniformly or using prioritized experience replay

        # If the size of the buffer is less than batch size then return
        if self.buffer.get_buffer_size() < self.batch_size:
            return None, None

        transitions = self.buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        rewards = batch.reward
        dones = batch.done

        #actions = list(actions)
        rewards = list(rewards)
        dones = list(dones)

        states = Variable(torch.cat(states))
        new_states = Variable(torch.cat(new_states), volatile=True)
        actions = Variable(torch.cat(actions))
        rewards = Variable(torch.cat(rewards))
        dones = Variable(torch.cat(dones))

        if self.cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        # Step 2: Compute the target values using the target actor network and target critic network
        # Compute the Q-values given the current state ( in this case it is the new_states)
        #with torch.no_grad():

        new_action = self.target_actor(new_states)
        next_Q_values = self.target_critic(new_states, new_action)
        # Find the Q-value for the action according to the target actor network
        # We do this because calculating max over a continuous action space is intractable
        next_Q_values = torch.squeeze(next_Q_values, dim=1)
        next_Q_values = next_Q_values * (1 - dones)
        # Re-wrap the target values so the critic loss does not inherit the volatile flag
        next_Q_values = Variable(next_Q_values.data)
        y = rewards + self.gamma * next_Q_values

        # Zero the optimizer gradients
        self.actor_optim.zero_grad()
        self.critic_optim.zero_grad()

        # Forward pass
        outputs = self.critic(states, actions)
        loss = self.criterion(outputs, y)
        loss.backward()
        # Clamp the gradients to avoid exploding gradients
        for param in self.critic.parameters():
            param.grad.data.clamp_(-1, 1)
        self.critic_optim.step()

        # Updating the actor policy
        policy_loss = -1 * self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        # Clamp the gradients to avoid exploding gradients
        for param in self.actor.parameters():
            param.grad.data.clamp_(-1, 1)
        self.actor_optim.step()

        return loss, policy_loss
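The `volatile` flag used above was removed in PyTorch 0.4; the same DDPG critic target can be expressed with `torch.no_grad()`. A sketch, assuming the target networks and batched tensors from the method above:

import torch

def ddpg_critic_target(target_actor, target_critic, new_states, rewards, dones, gamma):
    # y = r + gamma * Q'(s', mu'(s')), computed without tracking gradients
    with torch.no_grad():
        new_action = target_actor(new_states)
        next_q_values = target_critic(new_states, new_action).squeeze(dim=1)
        return rewards + gamma * next_q_values * (1 - dones)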
Example #9
    def __init__(self,
                 num_hidden_units,
                 input_dim,
                 num_actions,
                 num_q_val,
                 observation_dim,
                 goal_dim,
                 batch_size,
                 use_cuda,
                 gamma,
                 random_seed,
                 actor_optimizer,
                 critic_optimizer,
                 actor_learning_rate,
                 critic_learning_rate,
                 loss_function,
                 polyak_constant,
                 buffer_capacity,
                 non_conv=True,
                 num_conv_layers=None,
                 num_pool_layers=None,
                 conv_kernel_size=None,
                 img_height=None,
                 img_width=None,
                 input_channels=None):

        self.num_hidden_units = num_hidden_units
        self.non_conv = non_conv
        self.num_actions = num_actions
        self.num_q = num_q_val
        self.obs_dim = observation_dim
        self.goal_dim = goal_dim
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.cuda = use_cuda
        self.gamma = gamma
        self.seed(random_seed)
        self.actor_optim = actor_optimizer
        self.critic_optim = critic_optimizer
        self.actor_lr = actor_learning_rate
        self.critic_lr = critic_learning_rate
        self.criterion = loss_function
        self.tau = polyak_constant
        self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                          seed=random_seed)

        # Convolution Parameters
        self.num_conv = num_conv_layers
        self.pool = num_pool_layers
        self.im_height = img_height
        self.im_width = img_width
        self.conv_kernel_size = conv_kernel_size
        self.input_channels = input_channels

        if non_conv:
            self.target_actor = ActorDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_action=num_actions,
                input=input_dim)

            self.actor = ActorDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_action=num_actions,
                input=input_dim)

            self.target_critic = CriticDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_q_value=num_q_val,
                input=input_dim,
                action_dim=num_actions,
                goal_dim=self.goal_dim)
            self.critic = CriticDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_q_value=num_q_val,
                input=input_dim,
                action_dim=num_actions,
                goal_dim=self.goal_dim)

        else:
            self.target_actor = ActorDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_action=self.num_actions,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)

            self.actor = ActorDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_action=self.num_actions,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)

            self.target_critic = CriticDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_q_value=self.num_q,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)
            self.critic = CriticDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_q_value=self.num_q,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)
        if self.cuda:
            self.target_actor = self.target_actor.cuda()
            self.actor = self.actor.cuda()
            self.target_critic = self.target_critic.cuda()
            self.critic = self.critic.cuda()

        # Initializing the target networks with the standard network weights
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Create the optimizers for the actor and critic using the corresponding learning rate
        actor_parameters = self.actor.parameters()
        critic_parameters = self.critic.parameters()

        self.actor_optim = opt.Adam(actor_parameters, lr=self.actor_lr)
        self.critic_optim = opt.Adam(critic_parameters, lr=self.critic_lr)

        # Initialize a random exploration noise
        self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
            self.num_actions)
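`random_process.OrnsteinUhlenbeckActionNoise` is referenced but not defined in this excerpt; the sketch below shows a typical OU process with illustrative default parameters (the actual implementation may differ):

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # Start the process at its long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.state) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state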
Example #10
    def __init__(
        self,
        state_dim,
        action_dim,
        hidden_dim,
        actor,
        critic,
        value_network,
        target_value_network,
        polyak_constant,
        actor_learning_rate,
        critic_learning_rate,
        value_learning_rate,
        num_q_value,
        num_v_value,
        batch_size,
        gamma,
        random_seed,
        num_epochs,
        num_rollouts,
        num_eval_rollouts,
        env,
        eval_env,
        nb_train_steps,
        max_episodes_per_epoch,
        output_folder,
        use_cuda,
        buffer_capacity,
        policy_reg_mean_weight=1e-3,
        policy_reg_std_weight=1e-3,
        policy_preactivation_weight=0,
        verbose=True,
        plot_stats=False,
    ):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden = hidden_dim
        self.q_dim = num_q_value
        self.v_dim = num_v_value
        self.actor = actor
        self.critic = critic
        self.value = value_network
        self.tau = polyak_constant
        self.bs = batch_size
        self.gamma = gamma
        self.seed = random_seed
        self.use_cuda = use_cuda
        self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                          seed=self.seed)
        self.policy_reg_mean_weight = policy_reg_mean_weight
        self.policy_reg_std_weight = policy_reg_std_weight
        self.policy_pre_activation_weight = policy_preactivation_weight

        # Training specific parameters
        self.num_epochs = num_epochs
        self.num_rollouts = num_rollouts
        self.num_eval_rollouts = num_eval_rollouts
        self.env = env
        self.eval_env = eval_env
        self.nb_train_steps = nb_train_steps
        self.max_episodes_per_epoch = max_episodes_per_epoch
        self.statistics = defaultdict(float)
        self.combined_statistics = defaultdict(list)
        self.verbose = verbose
        self.output_folder = output_folder
        self.plot_stats = plot_stats

        self.actor_optim = optim.Adam(lr=actor_learning_rate,
                                      params=self.actor.parameters())
        self.critic_optim = optim.Adam(lr=critic_learning_rate,
                                       params=self.critic.parameters())
        self.value_optim = optim.Adam(lr=value_learning_rate,
                                      params=self.value.parameters())

        self.target_value = target_value_network

        if self.use_cuda:
            self.actor = self.actor.cuda()
            self.critic = self.critic.cuda()
            self.value = self.value.cuda()
            self.target_value = self.target_value.cuda()

        # Initializing the target networks with the standard network weights
        self.target_value.load_state_dict(self.value.state_dict())

        # Initialize a random exploration noise
        self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
            self.action_dim)
Example #11
    def train(self):
        epoch_episode_rewards = []

        # Initialize the training with an initial state
        state = self.env.reset()

        # Initialize the losses
        episode_reward = 0
        # Check whether to use cuda or not
        state = to_tensor(state, use_cuda=self.use_cuda)

        fwd_loss = 0
        stats_loss = 0
        policy_loss = 0

        # Mean rewards
        mean_rewards = []
        with torch.no_grad():
            state = self.encoder(state)
        state = state.detach()

        for frame_idx in range(1, self.num_frames + 1):
            epsilon_by_frame = epsilon_greedy_exploration()
            epsilon = epsilon_by_frame(frame_idx)
            action = self.policy_network.act(state, epsilon)

            # Execute the action
            next_state, reward, done, success = self.env.step(action.item())
            episode_reward += reward

            reward = np.sign(reward)

            next_state = to_tensor(next_state, use_cuda=self.use_cuda)
            with torch.no_grad():
                next_state = self.encoder(next_state)

            next_state = next_state.detach()

            reward = torch.tensor([reward], dtype=torch.float)

            done_bool = done * 1
            done_bool = torch.tensor([done_bool], dtype=torch.float)

            # Store in the replay buffer
            self.store_transition(state=state,
                                  new_state=next_state,
                                  action=action,
                                  done=done_bool,
                                  reward=reward)

            state = next_state

            if done:
                epoch_episode_rewards.append(episode_reward)
                # Add episode reward to tensorboard
                episode_reward = 0
                state = self.env.reset()
                state = to_tensor(state, use_cuda=self.use_cuda)
                with torch.no_grad():
                    state = self.encoder(state)
                state = state.detach()

            # Train the forward dynamics model
            if len(self.replay_buffer) > self.fwd_limit:
                # Sample a minibatch from the replay buffer
                transitions = self.replay_buffer.sample_batch(self.batch_size)
                batch = Buffer.Transition(*zip(*transitions))
                batch = self.get_train_variables(batch)
                mse_loss = self.train_forward_dynamics(batch=batch)
                fwd_loss += mse_loss.item()
                if frame_idx % self.print_every == 0:
                    print('Forward Dynamics Loss :',
                          fwd_loss / (frame_idx - self.fwd_limit))

            # Train the statistics network and the policy
            if len(self.replay_buffer) > self.policy_limit:
                transitions = self.replay_buffer.sample_batch(self.batch_size)
                batch = Buffer.Transition(*zip(*transitions))
                batch = self.get_train_variables(batch)
                loss, aug_rewards = self.train_statistics_network(batch=batch)

                p_loss = self.train_policy(batch=batch, rewards=aug_rewards)

                stats_loss += loss.item()
                policy_loss += p_loss.item()

                if frame_idx % self.print_every == 0:
                    print('Statistics Loss: ',
                          stats_loss / (frame_idx - self.policy_limit))
                    print('Policy Loss: ',
                          policy_loss / (frame_idx - self.policy_limit))

            # Print the statistics
            if self.verbose:
                if frame_idx % self.print_every == 0:
                    print('Mean Reward ', str(np.mean(epoch_episode_rewards)))
                    print('Sum of Rewards ',
                          str(np.sum(epoch_episode_rewards)))
                    mean_rewards.append(np.mean(epoch_episode_rewards))

            if self.plot_stats:
                if frame_idx % self.plot_every == 0:
                    # Plot the statistics calculated
                    self.plot(frame_idx=frame_idx,
                              rewards=epoch_episode_rewards,
                              mean_rewards=mean_rewards,
                              output_folder=self.output_folder,
                              placeholder_name='/DQN_montezuma_intrinsic')

            # Update the target network
            if frame_idx % self.update_every == 0:
                self.update_networks()

            # Save the models and the rewards file
            if frame_idx % self.save_epoch == 0:
                self.save_m()
                self.save_rewards(ep_rewards=epoch_episode_rewards,
                                  mean_rewards=mean_rewards)

        self.save_m()
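`epsilon_greedy_exploration` is used above but not defined in this excerpt; a common implementation returns an exponentially decaying schedule over frames. A hypothetical sketch (the constants are illustrative):

import math

def epsilon_greedy_exploration(epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=30000):
    # Returns a function mapping the frame index to an exponentially decayed epsilon
    def epsilon_by_frame(frame_idx):
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)
    return epsilon_by_frame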
Example #12
    def __init__(self,
                 env,
                 encoder,
                 forward_dynamics,
                 statistics_network,
                 target_policy_network,
                 policy_network,
                 forward_dynamics_lr,
                 stats_lr,
                 policy_lr,
                 num_train_epochs,
                 num_frames,
                 num_fwd_train_steps,
                 num_stats_train_steps,
                 fwd_dynamics_limit,
                 stats_network_limit,
                 policy_limit,
                 size_replay_buffer,
                 random_seed,
                 polyak_constant,
                 discount_factor,
                 batch_size,
                 action_space,
                 model_output_folder,
                 save_epoch,
                 target_stats_network=None,
                 target_fwd_dynamics_network=None,
                 clip_rewards=True,
                 clip_augmented_rewards=False,
                 print_every=2000,
                 update_network_every=2000,
                 plot_every=5000,
                 intrinsic_param=0.01,
                 non_episodic_intrinsic=True,
                 use_mine_formulation=True,
                 use_cuda=False,
                 save_models=True,
                 plot_stats=False,
                 verbose=True):

        self.encoder = encoder
        self.fwd = forward_dynamics
        self.stats = statistics_network
        self.use_cuda = use_cuda
        self.policy_network = policy_network
        self.target_policy_network = target_policy_network
        self.output_folder = model_output_folder
        self.use_mine_formulation = use_mine_formulation
        self.env = env
        self.train_epochs = num_train_epochs
        self.num_frames = num_frames
        self.num_fwd_train_steps = num_fwd_train_steps
        self.num_stats_train_steps = num_stats_train_steps
        self.fwd_lr = forward_dynamics_lr
        self.stats_lr = stats_lr
        self.policy_lr = policy_lr
        self.random_seed = random_seed
        self.save_models = save_models
        self.plot_stats = plot_stats
        self.verbose = verbose
        self.intrinsic_param = intrinsic_param
        self.save_epoch = save_epoch
        self.clip_rewards = clip_rewards
        self.clip_augmented_rewards = clip_augmented_rewards
        self.max = torch.zeros(1)
        self.min = torch.zeros(1)

        self.fwd_limit = fwd_dynamics_limit
        self.stats_limit = stats_network_limit
        self.policy_limit = policy_limit

        self.print_every = print_every
        self.update_every = update_network_every
        self.plot_every = plot_every
        self.non_episodic = non_episodic_intrinsic

        self.statistics = defaultdict(float)
        self.combined_statistics = defaultdict(list)

        self.target_stats_network = target_stats_network
        self.target_fwd_dynamics_network = target_fwd_dynamics_network

        # Fix the encoder weights
        for param in self.encoder.parameters():
            param.requires_grad = False

        self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                                 seed=self.random_seed)

        self.tau = polyak_constant
        self.gamma = discount_factor
        self.batch_size = batch_size
        self.action_space = action_space

        torch.manual_seed(self.random_seed)
        if self.use_cuda:
            torch.cuda.manual_seed(self.random_seed)

        if self.use_cuda:
            self.encoder = self.encoder.cuda()
            self.fwd = self.fwd.cuda()
            self.stats = self.stats.cuda()
            self.policy_network = self.policy_network.cuda()
            self.target_policy_network = self.target_policy_network.cuda()

        self.fwd_optim = optim.Adam(params=self.fwd.parameters(),
                                    lr=self.fwd_lr)
        self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                       lr=self.policy_lr)
        self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                      lr=self.stats_lr)
        # Update the policy and target policy networks
        self.update_networks()
Example #13
    def __init__(self,
                 env,
                 encoder,
                 inverse_dynamics,
                 forward_dynamics,
                 source_distribution,
                 statistics_network,
                 target_policy_network,
                 policy_network,
                 encoder_lr,
                 inverse_dynamics_lr,
                 forward_dynamics_lr,
                 source_d_lr,
                 stats_lr,
                 policy_lr,
                 num_train_epochs,
                 num_epochs,
                 num_rollouts,
                 size_replay_buffer,
                 size_dqn_replay_buffer,
                 random_seed,
                 polyak_constant,
                 discount_factor,
                 batch_size,
                 action_space,
                 observation_space,
                 model_output_folder,
                 train_encoder=False,
                 use_mine_formulation=True,
                 use_cuda=False):

        self.encoder = encoder
        self.invd = inverse_dynamics
        self.fwd = forward_dynamics
        self.source_distribution = source_distribution
        self.stats = statistics_network
        self.use_cuda = use_cuda
        self.policy_network = policy_network
        self.target_policy_network = target_policy_network
        self.model_output_folder = model_output_folder
        self.use_mine_formulation = use_mine_formulation
        self.env = env
        self.num_epochs = num_epochs
        self.train_epochs = num_train_epochs
        self.num_rollouts = num_rollouts
        self.e_lr = encoder_lr
        self.invd_lr = inverse_dynamics_lr
        self.fwd_lr = forward_dynamics_lr
        self.source_lr = source_d_lr
        self.stats_lr = stats_lr
        self.policy_lr = policy_lr
        self.random_seed = random_seed
        self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                                 seed=self.random_seed)
        self.dqn_replay_buffer = Buffer.ReplayBuffer(capacity=size_dqn_replay_buffer,
                                                     seed=self.random_seed)
        self.tau = polyak_constant
        self.gamma = discount_factor
        self.batch_size = batch_size
        self.action_space = action_space
        self.obs_space = observation_space

        torch.manual_seed(self.random_seed)
        if self.use_cuda:
            torch.cuda.manual_seed(self.random_seed)

        if self.use_cuda:
            self.encoder = self.encoder.cuda()
            self.invd = self.invd.cuda()
            self.fwd = self.fwd.cuda()
            self.policy_network = self.policy_network.cuda()
            self.source_distribution = self.source_distribution.cuda()

        # Define the optimizers
        if train_encoder:
            self.e_optim = optim.Adam(params=self.encoder.parameters(), lr=self.e_lr)
        self.invd_optim = optim.Adam(params=self.invd.parameters(), lr=self.invd_lr)
        self.fwd_optim = optim.Adam(params=self.fwd.parameters(), lr=self.fwd_lr)
        self.policy_optim = optim.Adam(params=self.policy_network.parameters(), lr=self.policy_lr)
        self.source_optim = optim.Adam(params=self.source_distribution.parameters(), lr=self.source_lr)
        self.stats_optim = optim.Adam(params=self.stats.parameters(), lr=self.stats_lr)
Example #14
class PPO(object):
    def __init__(self,
                 actions,
                 device,
                 lr=1e-2,
                 gamma=0.99,
                 ppo_clip=0.2,
                 ppo_epoch=5,
                 batch_size=8):
        super(PPO, self).__init__()
        self.device = device
        self.policy_new = ActorCritic(actions)
        self.policy_old = ActorCritic(actions)
        self.policy_new.apply(init_weights)
        self.policy_old.apply(init_weights)
        self.optimizer = optim.Adam(self.policy_new.parameters(), lr=lr)
        self.policy_old.load_state_dict(self.policy_new.state_dict())
        self.ppo_clip = ppo_clip
        self.ppo_epoch = ppo_epoch
        self.batch_size = batch_size
        self.memory = Buffer(device)
        self.max_timestamp_per_episode = 1000
        self.gamma = gamma
        self.value_loss_coef = 0.5
        self.MseLoss = nn.MSELoss()

    def get_action_and_prob(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action_predictions, _ = self.policy_old(state)
        action_probabilities = F.softmax(action_predictions, dim=-1)
        distributions = Categorical(action_probabilities)
        action = distributions.sample()
        action_log_prob = distributions.log_prob(action)
        return action, action_log_prob

    def evaluate_policy(self, states, actions):
        action_predictions, value_pred = self.policy_new(states)
        action_probabilities = F.softmax(action_predictions, dim=-1)
        distributions = Categorical(action_probabilities)
        action_log_probs = distributions.log_prob(actions)
        return action_log_probs, value_pred

    def calculate_rewards(self, old_rewards):

        rewards = []
        discounted_reward = 0
        for reward in reversed(old_rewards):
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.append(discounted_reward)
        rewards.reverse()
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        return rewards

    def calculate_advantage(self, discounted_rewards, critic_rewards):
        advantages = discounted_rewards - critic_rewards
        advantages = (advantages - advantages.mean()) / advantages.std()
        return advantages

    def run_episode(self, env, train):
        state = env.reset()
        total_rewards = 0
        loss = 0
        for i in range(self.max_timestamp_per_episode):
            if not train:
                env.render()
            action, action_log_prob = self.get_action_and_prob(state)
            next_state, reward, done, _ = env.step(action.item())
            total_rewards += reward
            if train:
                self.memory.add_transition(state, action, reward,
                                           action_log_prob, done)
            state = next_state
            if done:
                break
        if train:
            loss = self.train_policy()
            self.memory.clear()
        return total_rewards, loss

    def train_policy(self):

        old_states, old_actions, old_probs, rewards, _ = self.memory.get_transitions(
        )
        rewards = self.calculate_rewards(rewards)
        epoch_loss = []
        for epoch in range(self.ppo_epoch):
            probs, value_pred = self.evaluate_policy(old_states, old_actions)
            ratio = (probs - old_probs).exp()
            advantages = self.calculate_advantage(rewards, value_pred.detach())
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio,
                                min=1.0 - self.ppo_clip,
                                max=1.0 + self.ppo_clip) * advantages
            self.optimizer.zero_grad()
            loss = (-torch.min(surr1, surr2) + self.value_loss_coef *
                    F.smooth_l1_loss(value_pred, rewards)).mean()
            loss.backward()
            self.optimizer.step()
            epoch_loss.append(loss.item())

        self.policy_old.load_state_dict(self.policy_new.state_dict())

        return np.mean(epoch_loss)
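A minimal usage sketch for the `PPO` class above, assuming a classic Gym-style environment (reset returns the observation, step returns four values; `CartPole-v1` is used here only as an example) and that `ActorCritic`, `init_weights`, and `Buffer` are available as in the surrounding code:

import gym
import torch

env = gym.make('CartPole-v1')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = PPO(actions=env.action_space.n, device=device)

for episode in range(500):
    # run_episode collects one trajectory and, with train=True, runs the PPO update
    episode_reward, loss = agent.run_episode(env, train=True)
    if episode % 10 == 0:
        print('Episode {}: reward={:.1f}, loss={:.4f}'.format(episode, episode_reward, loss))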
Example #15
    def train_statistics_network(self,
                                 use_jenson_shannon_divergence=True,
                                 use_target_forward_dynamics=False,
                                 use_target_stats_network=False,
                                 clamp_gradients=False):

        if self.replay_buffer.get_buffer_size() < self.batch_size:
            return None, None, None, None

        transitions = self.replay_buffer.sample_batch(self.batch_size)
        batch = Buffer.Transition(*zip(*transitions))

        # Get the separate values from the named tuple
        states = batch.state
        new_states = batch.next_state
        actions = batch.action
        rewards = batch.reward
        dones = batch.done

        states = Variable(torch.cat(states))
        new_states = Variable(torch.cat(new_states), requires_grad=False)
        actions = Variable(torch.cat(actions))
        rewards = Variable(torch.cat(rewards))
        dones = Variable(torch.cat(dones))

        if self.use_cuda:
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            new_states = new_states.cuda()
            dones = dones.cuda()

        all_actions = self.get_all_actions(self.action_space)
        all_actions = Variable(torch.cat(all_actions))

        new_state_marginals = []
        for state in states:
            state = state.expand(self.action_space, -1)
            if use_target_forward_dynamics:
                n_s = self.target_fwd(state, all_actions)
            else:
                n_s = self.fwd(state, all_actions)
            n_s = n_s.detach()
            n_s = n_s + state
            n_s = torch.mean(n_s, dim=0)
            n_s = torch.unsqueeze(n_s, dim=0)
            new_state_marginals.append(n_s)

        new_state_marginals = tuple(new_state_marginals)
        new_state_marginals = Variable(torch.cat(new_state_marginals),
                                       requires_grad=False)

        p_sa = self.stats(new_states, actions)
        p_s_a = self.stats(new_state_marginals, actions)

        p_s_ta = self.target_stats(new_states, actions)
        p_s_t_a = self.target_stats(new_state_marginals, actions)

        if use_jenson_shannon_divergence:
            # Improves stability and gradients are unbiased
            if use_target_stats_network:
                mutual_information = -F.softplus(-p_s_ta) - F.softplus(p_s_t_a)
            else:
                mutual_information = -F.softplus(-p_sa) - F.softplus(p_s_a)
            lower_bound = torch.mean(-F.softplus(-p_sa)) - torch.mean(
                F.softplus(p_s_a))
        else:
            # Use the KL (Donsker-Varadhan) formulation
            if use_target_stats_network:
                mutual_information = p_s_ta - p_s_t_a
            else:
                mutual_information = p_sa - p_s_a
            lower_bound = torch.mean(p_sa) - torch.log(
                torch.mean(torch.exp(p_s_a)))

        # Maximize the mutual information
        loss = -lower_bound
        self.stats_optim.zero_grad()
        loss.backward()
        # Clamp the gradients
        if clamp_gradients:
            for param in self.stats.parameters():
                param.grad.data.clamp_(-1, 1)
        self.stats_optim.step()

        # Store in the dqn replay buffer

        mutual_information = torch.squeeze(mutual_information, dim=-1)
        mutual_information = mutual_information.detach()

        rewards_combined = rewards + self.intrinsic_param * mutual_information
        # Store the updated reward transition in the replay buffer
        self.store_transition(state=states,
                              action=actions,
                              new_state=new_states,
                              reward=rewards_combined,
                              done=dones,
                              buffer=self.dqn_replay_buffer)

        return loss, rewards, mutual_information, lower_bound
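For reference, the Jensen-Shannon lower bound used in the JSD branch above can be isolated as a small helper: joint samples contribute -softplus(-T) and samples from the product of marginals contribute -softplus(T). A sketch, assuming the statistics-network scores have already been computed:

import torch.nn.functional as F

def jsd_lower_bound(t_joint, t_marginal):
    # t_joint: scores T(s', a) on joint samples; t_marginal: scores on marginal samples
    return (-F.softplus(-t_joint)).mean() - F.softplus(t_marginal).mean()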