Example #1
 def __init__(self,
              observation_spaces,
              action_spaces,
              shared=False,
              hyperparameters=None,
              name=None):
     name = 'policy_group' if name is None else name
     self.hyperparameters = hyperparameters if hyperparameters else {}
     self.shared = next(iter(observation_spaces.keys())) if shared else None
     if shared:
         shared_obs_space = observation_spaces[self.shared]
         shared_act_space = action_spaces[self.shared]
         shared_policy = Policy(shared_obs_space,
                                shared_act_space,
                                name='shared_policy')
         policies = {}
         for pname, (obs_space,
                     act_space) in zip_map(observation_spaces,
                                           action_spaces):
             assert shared_obs_space == obs_space
             assert shared_act_space == act_space
             policies[pname] = shared_policy
     else:
         policies = {
             name: Policy(obs_space, act_space, name=name)
             for name, (
                 obs_space,
                 act_space) in zip_map(observation_spaces, action_spaces)
         }
     super().__init__(policies, shared=self.shared, name=name)
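A hypothetical usage sketch for this constructor: the class name PolicyGroup is only inferred from the 'policy_group' default name, and the agent names and gym Box spaces are illustrative (Box also appears in Example #3), so treat this as an assumption rather than the project's actual API.

from gym.spaces import Box

# PolicyGroup is the assumed class name for the __init__ shown above.
observation_spaces = {'agent_%d' % i: Box(0, 1, shape=(4,)) for i in range(3)}
action_spaces = {'agent_%d' % i: Box(-1, 1, shape=(2,)) for i in range(3)}

# With shared=True a single Policy is built and every agent name maps to it,
# which is why the constructor asserts that all spaces are identical.
group = PolicyGroup(observation_spaces, action_spaces, shared=True)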
Example #2
    def _build(self, observations):
        '''
        Build the agents in the group.

        :param observations: (dict) A dictionary of tensors that maps names to
                                    observations.
        :return: (PolicyGroupFunc) A namedtuple containing the policies and
                                   maps of actions, target actions, entropy,
                                   and noisy targets, plus the target-update
                                   op.
        '''

        if self.shared:
            names, observations = list(zip(*list(observations.items())))
            length = len(observations)
            observations = tf.concat(observations, 0)
            policy = self.group[self.shared](observations)
            policies = {name: policy for name in names}
            actions = tf.split(policy.predict, length)
            target_actions = tf.split(policy.predict_target, length)
            entropy = tf.split(policy.entropy, length)
            noisy_target = tf.split(policy.noisy_target, length)
            actions = {name: action for name, action in zip(names, actions)}
            target_actions = {
                name: action
                for name, action in zip(names, target_actions)
            }
            entropy = {name: ent for name, ent in zip(names, entropy)}
            noisy_target = {
                name: target
                for name, target in zip(names, noisy_target)
            }
            update = policy.update_target
        else:
            policies = {
                name: policy(obs)
                for name, (policy, obs) in zip_map(self.group, observations)
            }
            actions = {}
            target_actions = {}
            entropy = {}
            noisy_target = {}
            update = []
            for name, policy in policies.items():
                actions[name] = policy.predict
                target_actions[name] = policy.predict_target
                entropy[name] = policy.entropy
                noisy_target[name] = policy.noisy_target
                update.append(policy.update_target)
            update = tf.group(*update)
        return PolicyGroupFunc(policies, actions, target_actions, update,
                               entropy, noisy_target)
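The shared branch above concatenates the per-agent observation batches along axis 0, runs the shared policy once, and splits the outputs back into per-agent pieces with tf.split. A small, purely illustrative round-trip of that concat/split pattern, assuming the TF1 graph/session API used throughout these examples:

import numpy as np
import tensorflow as tf

# Two agents, each with a batch of 4 observations of size 3.
per_agent = [np.ones((4, 3), dtype=np.float32) * i for i in range(2)]
merged = tf.concat(per_agent, 0)    # shape (8, 3): agents stacked on axis 0
split_back = tf.split(merged, 2)    # back to two (4, 3) pieces

with tf.Session() as sess:          # TF1-style session
    outs = sess.run(split_back)
assert np.allclose(outs[0], 0) and np.allclose(outs[1], 1)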
Example #3
def test_convert_spaces_to_placeholders():
    '''Test convert_spaces_to_placeholders function.'''
    spaces = {str(i): Box(0, 1, shape=(4, )) for i in range(10)}
    placeholders = utils.convert_spaces_to_placeholders(spaces, False)
    assert spaces.keys() == placeholders.keys()
    for _, (space, placeholder) in utils.zip_map(spaces, placeholders):
        assert list(space.shape) == placeholder.shape.as_list()
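For reference, a minimal sketch of what convert_spaces_to_placeholders could look like so that this test passes; the 'batch' flag name and the float32 dtype are assumptions, only Box-like spaces with a .shape are handled, and the real utils implementation may differ.

import tensorflow as tf

def convert_spaces_to_placeholders(spaces, batch=True):
    '''Map each gym space to a float32 placeholder with a matching shape.'''
    return {
        name: tf.placeholder(
            tf.float32,
            # With batch=False the placeholder shape equals space.shape exactly,
            # which is what the test above asserts.
            shape=([None] if batch else []) + list(space.shape),
            name='obs_' + name)
        for name, space in spaces.items()
    }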
Example #4
 def compute_qvalue(self, observations, actions, rewards, dones, gamma):
     '''Compute the Q value.'''
     target = self.critic_group(observations, actions).target_values
     return {
         name: tf.stop_gradient(R + gamma * (1. - D) * Q)
         for name, (Q, R, D) in zip_map(target, rewards, dones)
     }
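compute_qvalue produces the standard one-step TD target R + gamma * (1 - done) * Q_target for each agent and wraps it in tf.stop_gradient so no gradients flow back into the target computation. A plain-numbers illustration of the formula (NumPy only, not from the source):

import numpy as np

gamma = 0.95
rewards = np.array([1.0, 0.5])      # R
target_q = np.array([2.0, 3.0])     # Q from the target critic
dones = np.array([0.0, 1.0])        # 1.0 marks a terminal step

targets = rewards + gamma * (1.0 - dones) * target_q
# targets == [2.9, 0.5]: the bootstrap term is dropped on the terminal step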
Example #5
 def __init__(self, observation_spaces, action_spaces, shared=False,
              hyperparameters=None, name=None):
     name = 'critic_group' if name is None else name
     self.hyperparameters = hyperparameters if hyperparameters else {}
     self.shared = next(iter(observation_spaces.keys())) if shared else None
     if shared:
         obs_space = observation_spaces[self.shared]
         act_space = action_spaces[self.shared]
         shared_critic = Critic(obs_space, act_space, name='shared_critic')
         critics = {}
         for key, (obs, act) in zip_map(observation_spaces, action_spaces):
             assert obs_space == obs
             assert act_space == act
             critics[key] = shared_critic
     else:
         critics = {key: Critic(obs, act, name=key)
                    for key, (obs, act) in
                    zip_map(observation_spaces, action_spaces)}
     super().__init__(critics, shared=self.shared, name=name)
Example #6
 def _encode_sample(self, idxes):
     obses_t = defaultdict(list)
     actions = defaultdict(list)
     rewards = defaultdict(list)
     obses_tp1 = defaultdict(list)
     dones = defaultdict(list)
     for i in idxes:
         data = self._storage[i]
         for key, (obs_t, action, reward, obs_tp1, done) in zip_map(*data):
             obses_t[key].append(obs_t)
             actions[key].append(action)
             rewards[key].append(reward)
             obses_tp1[key].append(obs_tp1)
             dones[key].append(done)
     return obses_t, actions, rewards, obses_tp1, dones
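Example #13 calls replay.sample(batch_size) and unpacks the same five per-agent maps, so the buffer presumably wraps _encode_sample in a sample method roughly like the sketch below; the random-index drawing mirrors the common OpenAI Baselines ReplayBuffer pattern and is an assumption.

import random

# Method of the same replay buffer class as _encode_sample above.
def sample(self, batch_size):
    '''Draw batch_size random transitions and return the five per-agent maps.'''
    idxes = [random.randint(0, len(self._storage) - 1)
             for _ in range(batch_size)]
    return self._encode_sample(idxes)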
Example #7
 def create_optimizers(self, values):
     '''Create optimizers from the group.'''
     losses = {}
     opts = {}
     learning_rate = self.hyperparameters.get('learning_rate', 1e-4)
     if self.shared:
         policy = self.group[self.shared]
         values = values[self.shared]
         opts, loss = policy.create_optimizer(values,
                                              learning_rate=learning_rate)
         losses = {name: loss for name in self.group}
     else:
         for name, (policy, value) in zip_map(self.group, values):
             opts[name], losses[name] = policy.create_optimizer(
                 value, learning_rate=learning_rate)
         opts = tf.group(*list(opts.values()))
     return opts, losses
Example #8
    def _build(self, observation):
        '''
        Build the networks needed for the MADDPG.

        :param observation: (dict) A dictionary that maps agent names to
                                   observation tensors.
        :return: (ComaFunc) A tuple of functions used for evaluating
                            and training.
        '''
        if self.normalize.get('observation'):
            observation = {
                key: norm(obs, False)
                for key, (obs, norm) in zip_map(observation,
                                                self.normalize['observation'])
            }

        policies = self.policy_group(observation)
        predict = policies.actions
        return ComaFunc(None, None, None, None, predict, None, None, None)
Example #9
    def _build(self, observations, actions):
        '''
        Build the agents in the group.

        :param observations: (dict) A dictionary of tensors that maps names to
                                    observations.
        :param actions: (dict) A dictionary of tensors that maps names to
                               actions.
        :return: (CriticGroupFunc) A namedtuple containing the critics, values,
                                   target values, and the target-update op.
        '''
        if self.shared:
            observations = sorted(list(observations.items()),
                                  key=lambda x: x[0])
            actions = sorted(list(actions.items()),
                             key=lambda x: x[0])
            names, observations = list(zip(*observations))
            _, actions = list(zip(*actions))
            length = len(observations)
            observations = tf.concat(observations, 0)
            actions = tf.concat(actions, 0)
            critic = self.group[self.shared](observations, actions)
            critics = {name: critic for name in names}
            values = tf.split(critic.predict, length)
            target_values = tf.split(critic.predict_target, length)
            values = {name: value for name, value in zip(names, values)}
            target_values = {name: value
                             for name, value in zip(names, target_values)}
            #update = {name: policy.update_target for name in self.group}
            update = critic.update_target
        else:
            critics = {name: critic(obs, act) for name, (critic, obs, act) in
                       zip_map(self.group, observations, actions)}
            values = {}
            target_values = {}
            update = []
            for name, critic in critics.items():
                values[name] = critic.predict
                target_values[name] = critic.predict_target
                update.append(critic.update_target)
            update = tf.group(*update)
        return CriticGroupFunc(critics, values, target_values, update)
Example #10
def test_map_zip_all_equal():
    '''Test zip_map function.'''
    mappings = [{i: i for i in range(10)} for j in range(10)]
    for _, values in utils.zip_map(*mappings):
        assert len(set(values)) == 1
Example #11
def test_map_zip_all_descending():
    '''Test zip_map function.'''
    mappings = [{i: i for i in range(10 - j)} for j in range(10)]
    with pytest.raises(KeyError):
        for _, values in utils.zip_map(*mappings):
            assert len(set(values)) == 1
Example #12
def test_map_zip_all_ascending():
    '''Test zip_map function.'''
    mappings = [{i: i for i in range(10 + j)} for j in range(10)]
    for _, values in utils.zip_map(*mappings):
        assert len(set(values)) == 1
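Taken together, the three tests above pin down the contract of zip_map: iterate over the keys of the first mapping, look each key up in every other mapping (a missing key raises KeyError, extra keys are ignored), and yield the key together with the tuple of corresponding values. A minimal sketch consistent with that behaviour and with the call sites in the other examples; the real utils.zip_map may differ in details.

def zip_map(*mappings):
    '''Yield (key, (value_0, ..., value_n)) for each key of the first mapping.'''
    first, *rest = mappings
    for key, value in first.items():
        # Raises KeyError if a later mapping lacks a key of the first one.
        yield key, (value,) + tuple(mapping[key] for mapping in rest)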
Example #13
def main(batch_size=1):
    multi_env = MultiOptLRs(data_set='mnist', max_batches=100,
                            batch_size=128, max_history=25)
    agents = Maddpg(multi_env.observation_space,
                    multi_env.action_space,
                    shared_policy=True, shared_critic=True)
    print_tqdm('Starting...')
    exp_replay = {name: ReplayBuffer(1e6)
                  for name in multi_env.action_space.spaces}
    global_step = 0
    last_info = defaultdict(lambda: None)
    for _ in trange(60000):
        total_reward = 0
        states_last = states = multi_env.reset()
        done = False
        all_actions = []
        while not done:
            actions = agents.predict(states)
            actions = {key: np.squeeze(act) for key, act in actions.items()}
            states, reward, done, info = multi_env.step(actions)
            
            if done:
                last_info = info
            #print(np.any(np.isnan(list(actions.values()))))
            total_reward += reward
            rewards = {key: reward for key in states}
            dones = {key: done for key in states}
            all_results = states_last, actions, rewards, states, dones
            all_results = {
                name: values for name, values in zip_map(*all_results)
            }

            for key, (replay, results) in zip_map(exp_replay, all_results):
                replay.add(*results)

            if global_step > batch_size and global_step % 100 == 0:
                states_feed = {}
                actions_feed = {}
                rewards_feed = {}
                states_n_feed = {}
                dones_feed = {}
                idxs = {}
                for key, replay in exp_replay.items():
                    #idx, mem, _ = replay.sample(1024)
                    #idxs[key] = idx
                    #stat, actio, rewar, stat_n, don = mem
                    stat, actio, rewar, stat_n, don = replay.sample(batch_size)
                    states_feed[key] = stat
                    actions_feed[key] = actio
                    rewards_feed[key] = rewar
                    states_n_feed[key] = stat_n
                    dones_feed[key] = don
                loss_before = agents.compute_loss(states_feed, actions_feed,
                                                  rewards_feed, states_n_feed,
                                                  dones_feed)
                losses = agents.train_step(states_feed, actions_feed,
                                           rewards_feed, states_n_feed,
                                           dones_feed)
                losses = agents.compute_loss(states_feed, actions_feed,
                                             rewards_feed, states_n_feed,
                                             dones_feed)
                agents.update_targets()

                actor_loss_before = np.mean(
                    list(loss_before['actor'].values()))
                critic_loss_before = np.mean(
                    list(loss_before['critic'].values()))
                actor_loss = np.mean(list(losses['actor'].values()))
                critic_loss = np.mean(list(losses['critic'].values()))
                all_actions = [list(act.values()) for act in all_actions]
                print_tqdm('*'*80)
                print_tqdm('Training:')
                print_tqdm('Total Reward:', total_reward)
                print_tqdm('Stats:', last_info['episode'])
                print_tqdm('Grads Sum:', last_info['grads_sum'])
                print_tqdm('Action Mean:', last_info['actions_mean'])
                print_tqdm('Action Std:', last_info['actions_std'])
                print_tqdm('Network Loss:', last_info['loss'])
                print_tqdm('Network Accu:', last_info['accuracy'])
                print_tqdm('Actor Loss Before:', actor_loss_before)
                print_tqdm('Critic Loss Before:', critic_loss_before)
                print_tqdm('Actor Loss:', actor_loss)
                print_tqdm('Critic Loss:', critic_loss)
                print_tqdm('*'*80)

            states_last = states
            global_step += 1
            all_actions.append(actions)
    agents.save('optimizer/model.ckpt')
Example #14
    def _build(self,
               observation,
               actions,
               rewards,
               observation_n,
               dones,
               gamma=0.95):
        '''
        Build the networks needed for the MADDPG.

        :param observation: (dict) Map of agent names to observation tensors.
        :param actions: (dict) Map of agent names to action tensors.
        :param rewards: (dict) Map of agent names to reward tensors.
        :param observation_n: (dict) Map of agent names to next-observation
                                     tensors.
        :param dones: (dict) Map of agent names to boolean-like tensors where
                             a value of 1 marks the last step of an episode.
        :param gamma: (float) The discount factor to use.
        :return: (ComaFunc) A tuple of functions used for evaluating
                            and training.
        '''
        if self.normalize.get('observation'):
            observation = {
                key: norm(obs, False)
                for key, (obs, norm) in zip_map(observation,
                                                self.normalize['observation'])
            }
            observation_n = {
                key: norm(obs, False)
                for key, (obs, norm) in zip_map(observation_n,
                                                self.normalize['observation'])
            }
        if self.normalize.get('reward'):
            rewards = {
                key: norm(rew, False)
                for key, (rew,
                          norm) in zip_map(rewards, self.normalize['reward'])
            }
        obs_n_concat = U.concat_map(observation_n)
        obs_n_concat = {name: obs_n_concat for name in observation}
        global_critics = self.global_critic_group
        worst_qactions = self.worst_policy_group(observation_n).actions
        worst_qactions = U.concat_map(worst_qactions)
        worst_qactions = {name: worst_qactions for name in observation}
        worst_qvalues = global_critics(obs_n_concat,
                                       worst_qactions).target_values

        best_qactions = self.best_policy_group(observation_n).actions
        best_qactions = U.concat_map(best_qactions)
        best_qactions = {name: best_qactions for name in observation}
        best_qvalues = self.compute_global_qvalue(obs_n_concat, best_qactions,
                                                  rewards, dones, gamma)
        all_actions = U.concat_map(actions)
        all_actions = {name: all_actions for name in self.action_spaces}
        obs_concat = U.concat_map(observation)
        obs_concat = {name: obs_concat for name in observation}

        global_values = global_critics(obs_concat, all_actions).values

        global_opts = global_critics.create_optimizers(global_values,
                                                       best_qvalues)

        personal_reward = {
            name: tf.stop_gradient(gval - wval)
            for name, (gval, wval) in zip_map(global_values, worst_qvalues)
        }
        personal_critics = self.personal_critic_group
        personal_values = personal_critics(obs_concat, all_actions).values
        personal_qvalue = self.compute_personal_qvalue(obs_n_concat,
                                                       best_qactions,
                                                       personal_reward, dones,
                                                       gamma)
        personal_critic = personal_critics.create_optimizers(
            personal_values, personal_qvalue)

        predict = self.best_policy_group(observation).actions
        all_actions = U.concat_map(predict)
        all_actions = {name: all_actions for name in self.action_spaces}
        target_vals = personal_critics(obs_concat, all_actions).target_values

        worst_predict = self.worst_policy_group(observation).actions
        worst_predict = U.concat_map(worst_predict)
        worst_predict = {name: worst_predict for name in self.action_spaces}
        worst_vals = personal_critics(obs_concat, worst_predict).target_values
        worst_vals = {name: -v for name, v in worst_vals.items()}

        best_policy = self.best_policy_group.create_optimizers(target_vals)
        worst_policy = self.worst_policy_group.create_optimizers(worst_vals)

        critic_opts = [global_opts[0], personal_critic[0]]
        critic_losses = [global_opts[1], personal_critic[1]]
        po_opts = [best_policy[0], worst_policy[0]]
        po_losses = [best_policy[1], worst_policy[1]]

        critic_opts = tf.group(critic_opts)
        critic_losses = {
            name: tf.reduce_mean(tf.stack(losses, -1), -1)
            for name, losses in zip_map(*critic_losses)
        }
        update_critic = tf.group([
            global_critics.update_targets(5e-3),
            self.personal_critic_group.update_targets(5e-3)
        ])

        po_opts = tf.group(po_opts)
        po_losses = {
            name: tf.math.reduce_std(tf.stack(losses, -1), -1)
            for name, losses in zip_map(*po_losses)
        }
        update_policy = tf.group([
            self.worst_policy_group.update_targets(5e-3),
            self.best_policy_group.update_targets(5e-3)
        ])

        return ComaFunc(po_opts, critic_opts, po_losses, critic_losses,
                        predict, target_vals, update_policy, update_critic)
Example #15
    def _build(self,
               observation,
               actions,
               rewards,
               observation_n,
               dones,
               gamma=0.95):
        '''
        Build the networks needed for the MADDPG.

        :param observation: (dict) Map of agent names to observation tensors.
        :param actions: (dict) Map of agent names to action tensors.
        :param rewards: (dict) Map of agent names to reward tensors.
        :param observation_n: (dict) Map of agent names to next-observation
                                     tensors.
        :param dones: (dict) Map of agent names to boolean-like tensors where
                             a value of 1 marks the last step of an episode.
        :param gamma: (float) The discount factor to use.
        :return: (MaddpgFunc) A tuple of functions used for evaluating
                              and training.
        '''
        if self.normalize.get('observation'):
            observation = {
                key: norm(obs, False)
                for key, (obs, norm) in zip_map(observation,
                                                self.normalize['observation'])
            }
            observation_n = {
                key: norm(obs, False)
                for key, (obs, norm) in zip_map(observation_n,
                                                self.normalize['observation'])
            }
        if self.normalize.get('reward'):
            rewards = {
                key: norm(rew, False)
                for key, (rew,
                          norm) in zip_map(rewards, self.normalize['reward'])
            }
        obs_n_concat = U.concat_map(observation_n)
        obs_n_concat = {name: obs_n_concat for name in observation}
        qactions = self.policy_group(observation_n).target_actions
        qactions = U.concat_map(qactions)
        qactions = {name: qactions for name in observation}
        # qvalues = self.compute_qvalue(observation_n, qactions, rewards,
        #  dones, gamma)
        qvalues = self.compute_qvalue(obs_n_concat, qactions, rewards, dones,
                                      gamma)
        actions = U.concat_map(actions)
        actions = {name: actions for name in self.action_spaces}
        obs_concat = U.concat_map(observation)
        obs_concat = {name: obs_concat for name in observation}
        #values = self.critic_group(observation, actions).values
        values = self.critic_group(obs_concat, actions).values
        #cr_opts, cr_losses = self.get_critic_optimizer(values, qvalue)
        critic_opts, critic_losses = self.critic_group.create_optimizers(
            values, qvalues)
        predict = self.policy_group(observation).actions
        actions = U.concat_map(predict)
        actions = {name: actions for name in self.action_spaces}
        #target_vals = self.critic_group(observation, actions).target_values
        target_vals = self.critic_group(obs_concat, actions).target_values
        po_opts, po_losses = self.get_policy_optimizer(target_vals)
        update_critic = self.critic_group.update_targets(5e-3)
        return MaddpgFunc(po_opts, critic_opts, po_losses, critic_losses,
                          predict, target_vals,
                          self.policy_group.update_targets(5e-3),
                          update_critic)