Example No. 1
class CoopActionOtherDDPG(Agent):  # Two agents that can each observe the other's action (based on the keras-rl Agent implementation)

    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopActionOtherDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.
        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent, but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        assert self.processor is None  # Removed processors here for simplification. Not needed anyway
        assert nb_max_start_steps == 0  # Removed here for simplification. Not needed anyway
        assert action_repetition == 1  # Removed here for simplification. Not needed anyway

        self.agent1.training = True
        self.agent2.training = True

        experience_for_plotting = deque()

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation1 = observation2 = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation1 is None or observation2 is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    obs = env.reset()
                    observation1 = deepcopy(obs) + (0.,)
                    observation2 = deepcopy(obs) + (0.,)

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation1 is not None
                assert observation2 is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation1).item()
                action2 = self.agent2.forward(observation2).item()
                action = (action1, action2)
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                accumulated_info = {}
                done = False

                callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                obs, r, done, info = env.step(action)
                if done:
                    raise AttributeError  # The episode was reset unexpectedly
                    # (see https://stackoverflow.com/questions/42787924/)

                observation1 = deepcopy(obs) + (info["u2_clipped"],)  # Add action other to the observation
                observation2 = deepcopy(obs) + (info["u1_clipped"],)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward1 += info["r1"]
                reward2 += info["r2"]

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action[0] + action[1],
                    'observation': observation1,
                    'reward': reward1 + reward2,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if len(obs) == 2:
                    experience_for_plotting.append((info["t"], obs, (info["u1_clipped"], info["u2_clipped"]), (0., 0.),
                                                    r, (info["r1"], info["r2"])))

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation1)
                    self.agent2.forward(observation2)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation1 = None
                    observation2 = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()

        return experience_for_plotting
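
The sketch below is not part of the original example; it shows one plausible way to wire up the class above in keras-rl style. It assumes a hypothetical two-agent environment `TwoAgentEnv` whose observations are tuples, whose `step()` accepts a tuple of two scalar actions, and whose `info` dict carries the keys "r1", "r2", "u1_clipped", "u2_clipped" and "t" that the `fit()` loop reads; network sizes and hyperparameters are illustrative only.

# Hypothetical usage sketch for CoopActionOtherDDPG (not from the original repo).
from keras.layers import Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


def build_actor(obs_dim, nb_actions):
    # Small MLP actor; the observation is assumed to be augmented with the other agent's last action.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, obs_dim)))
    actor.add(Dense(32, activation='relu'))
    actor.add(Dense(nb_actions, activation='tanh'))
    return actor


def build_critic(obs_dim, nb_actions):
    # Q-network that takes both the action and the observation as inputs.
    action_input = Input(shape=(nb_actions,))
    observation_input = Input(shape=(1, obs_dim))
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(32, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    return Model(inputs=[action_input, observation_input], outputs=x), action_input


nb_actions = 1
obs_dim = 3  # assumed: base observation of size 2, plus 1 for the other agent's action

actor1, actor2 = build_actor(obs_dim, nb_actions), build_actor(obs_dim, nb_actions)
critic1, critic_action_input1 = build_critic(obs_dim, nb_actions)
critic2, critic_action_input2 = build_critic(obs_dim, nb_actions)

agent = CoopActionOtherDDPG(
    nb_actions, actor1, actor2, critic1, critic2,
    critic_action_input1, critic_action_input2,
    memory1=SequentialMemory(limit=100000, window_length=1),
    memory2=SequentialMemory(limit=100000, window_length=1),
    random_process1=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3),
    random_process2=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3))
agent.compile(Adam(lr=1e-3), metrics=['mae'])

env = TwoAgentEnv()  # hypothetical two-agent environment, see the assumptions above
experience = agent.fit(env, nb_steps=50000, nb_max_episode_steps=200)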
Example No. 2
class CoopDDPG(Agent):  # Two agents that cannot observe each other's actions (based on the keras-rl Agent implementation)

    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):

        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent, but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.agent1.training = True
        self.agent2.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    observation = deepcopy(env.reset())
                    if self.agent1.processor is not None:  # not individual for now
                        observation = self.agent1.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.agent1.processor is not None:  # not individual for now. action is not from agent anyway
                            action = self.agent1.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.agent1.processor is not None:
                            observation, reward, done, info = self.agent1.processor.process_step(observation, reward,
                                                                                                 done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. '
                                'You should probably lower the `nb_max_start_steps` parameter.'.format(
                                    nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.agent1.processor is not None:
                                observation = self.agent1.processor.process_observation(observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation)
                action2 = self.agent2.forward(observation)
                if self.agent1.processor is not None:
                    action1 = self.agent1.processor.process_action(action1)
                if self.agent2.processor is not None:
                    action2 = self.agent2.processor.process_action(action2)
                action = (action1.item(), action2.item())
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.agent1.processor is not None:
                        observation, r, done, info = self.agent1.processor.process_step(observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward1 += info["r1"]
                    reward2 += info["r2"]
                    reward += info["r1"] + info["r2"]
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation)
                    self.agent2.forward(observation)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()


        return history
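
Construction is analogous for the class above, except that neither actor observes the other agent's action, so the networks take the raw environment observation. A minimal sketch under the same assumptions as in Example No. 1 (hypothetical `TwoAgentEnv`, illustrative layer sizes and hyperparameters); note that this variant's `fit()` returns a Keras `History` object.

# Hypothetical usage sketch for CoopDDPG (not from the original repo).
from keras.layers import Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

nb_actions, obs_dim = 1, 2  # raw observation only; nothing is appended to it


def make_pair():
    # Build one actor/critic pair on the shared (non-augmented) observation.
    actor = Sequential([Flatten(input_shape=(1, obs_dim)),
                        Dense(32, activation='relu'),
                        Dense(nb_actions, activation='tanh')])
    action_input = Input(shape=(nb_actions,))
    observation_input = Input(shape=(1, obs_dim))
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(32, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    return actor, critic, action_input


actor1, critic1, critic_action_input1 = make_pair()
actor2, critic2, critic_action_input2 = make_pair()

agent = CoopDDPG(
    nb_actions, actor1, actor2, critic1, critic2,
    critic_action_input1, critic_action_input2,
    memory1=SequentialMemory(limit=100000, window_length=1),
    memory2=SequentialMemory(limit=100000, window_length=1),
    random_process1=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3),
    random_process2=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3))
agent.compile(Adam(lr=1e-3), metrics=['mae'])

history = agent.fit(TwoAgentEnv(), nb_steps=50000, nb_max_episode_steps=200)  # hypothetical env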