Example #1
    def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
             nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
        """Callback that is called before training begins."
        """
        if not self.compiled:
            raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.training = False
        self.step = 0

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        self._on_test_begin()
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = 0.
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.reset_states()
            observation = deepcopy(env.reset())
            if self.processor is not None:
                observation = self.processor.process_observation(observation)
            assert observation is not None

            # Perform random starts at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.
            nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
            for _ in range(nb_random_start_steps):
                if start_step_policy is None:
                    action = env.action_space.sample()
                else:
                    action = start_step_policy(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r, done, info)
                callbacks.on_action_end(action)
                if done:
                    warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(observation)
                    break

            # Run the episode until we're done.
            done = False
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, d, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, d, info = self.processor.process_step(observation, r, d, info)
                    callbacks.on_action_end(action)
                    reward += r
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True
                self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

            # We are in a terminal state but the agent hasn't yet seen it. We therefore
            # perform one more forward-backward call and simply ignore the action before
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            self.forward(observation)
            self.backward(0., terminal=False)

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history
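
A minimal usage sketch for the test() loop above, assuming keras-rl (or keras-rl2) with a classic Gym environment whose step() returns the four-tuple this loop expects; the network, memory, policy and the 'CartPole-v1' choice are illustrative assumptions and exact import paths depend on the installed keras-rl/gym versions:

import gym
import numpy as np
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import GreedyQPolicy

env = gym.make('CartPole-v1')           # hypothetical choice of environment
nb_actions = env.action_space.n

# Small Q-network; window_length=1 gives the leading (1,) in the input shape.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16, activation='relu'),
    Dense(nb_actions, activation='linear'),
])

agent = DQNAgent(model=model, nb_actions=nb_actions,
                 memory=SequentialMemory(limit=10000, window_length=1),
                 policy=GreedyQPolicy())
agent.compile(Adam(1e-3))               # test() raises RuntimeError without compile()

history = agent.test(env, nb_episodes=5, visualize=False)
print(np.mean(history.history['episode_reward']))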
Example #2
def test_new(self,
             env,
             nb_episodes=1,
             action_repetition=1,
             callbacks=None,
             visualize=True,
             nb_max_episode_steps=None,
             nb_max_start_steps=0,
             start_step_policy=None,
             verbose=1,
             arr=None):
    """Callback that is called before training begins."
    """
    if not self.compiled:
        raise RuntimeError(
            'You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
        )
    if action_repetition < 1:
        raise ValueError(
            'action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Replay the scripted action sequence (if any) before normal control starts.
        for ac in (arr if arr is not None else []):
            # print type(ac), ac
            if self.processor is not None:
                ac = self.processor.process_action(ac)
            callbacks.on_action_begin(ac)
            observation, reward, done, info = env.step(ac)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, reward, done, info = self.processor.process_step(
                    observation, reward, done, info)
            callbacks.on_action_end(ac)
            self.step += 1
            episode_step += 1
            episode_reward += reward
            if done:
                #warnings.warn('Env ended before the deterministic non-neural steps could end.')
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                break

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
            nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(
                    observation, r, done, info)
            callbacks.on_action_end(action)

            if done:
                warnings.warn(
                    'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                    .format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(
                        observation, r, d, info)
                callbacks.on_action_end(action)
                reward += r
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
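
test_new() differs from Example #1 by replaying a caller-supplied action sequence (arr) at the start of every episode, before the random starts and the learned policy. A hypothetical call, assuming the method is attached to an agent and environment set up as in the sketch under Example #1 and that action 0 is valid for the environment:

scripted_opening = [0, 0, 0]            # three deterministic opening actions
history = agent.test_new(env,
                         nb_episodes=3,
                         arr=scripted_opening,
                         visualize=False)
print(history.history['episode_reward'])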
Example #3
class RemoteAdfp(object):
    def __init__(self,
                 agent: ADFPAgent,
                 training_steps,
                 log_interval,
                 folder_path,
                 callbacks=None,
                 mode='train',
                 processor=None):

        # Prepare Callbacks
        callbacks = [] if not callbacks else callbacks[:]
        callbacks += [TrainIntervalLogger(interval=log_interval)]
        self.callbacks = CallbackList(callbacks)
        if hasattr(self.callbacks, 'set_model'):
            self.callbacks.set_model(agent)
        else:
            self.callbacks._set_model(agent)
        params = {
            'nb_steps': training_steps,
        }
        if hasattr(self.callbacks, 'set_params'):
            self.callbacks.set_params(params)
        else:
            self.callbacks._set_params(params)

        self.callbacks.on_train_begin()
        self.no_training_steps = training_steps

        # Create needed directories if not done yet
        self.folder_path = folder_path
        checkpoint_path = folder_path + '/checkpoints'
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

        # Parameters
        self.agent = agent
        self.episode_step = 0
        self.episode = 0
        self.episode_reward = 0
        self.step = 0
        self.recent_action = None
        self.recent_observation = None
        self.mode = mode
        self.processor = processor

    def train_move(self, raw_observation, measurement, goal_params, done):

        self.update()
        self.callbacks.on_step_begin(self.episode_step)

        if self.processor is not None:
            raw_observation = self.processor.process_observation(
                observation=raw_observation)
            measurement = self.processor.process_measurement(measurement)

        # If we have a list of goal_params just take one element for evaluation.
        eval_goal_params = goal_params if not isinstance(
            goal_params[0], list) else goal_params[-1]
        reward = self.agent.goal.immediate_reward_function(
            measurement, eval_goal_params)

        if self.step > 1:
            metrics = self.agent.backward(measurements=measurement,
                                          terminal=done)
            step_logs = {
                'action': self.recent_action,
                'observation': self.recent_observation,
                'reward': reward,
                'metrics': metrics,
                'episode': self.episode,
                'info': {},
            }
            self.episode_reward += reward
            self.callbacks.on_step_end(self.episode_step, step_logs)

        # perform next step
        if done:
            # report
            episode_logs = {
                'episode_reward': self.episode_reward,
                'nb_episode_steps': self.episode_step,
                'reward_per_step': self.episode_reward / self.episode_step
            }
            self.callbacks.on_episode_end(self.episode, episode_logs)

            self.episode += 1
            self.episode_step = 0
            self.episode_reward = 0.

            return

        action = self.agent.forward(observation=Observation(
            raw_features=raw_observation, measurements=measurement),
                                    goal_params=goal_params)

        # Update params for next backprop
        self.recent_observation = raw_observation
        self.recent_action = action

        return action

    def test_move(self, raw_observation, measurement, goal_params):

        if self.processor is not None:
            raw_observation = self.processor.process_observation(
                observation=raw_observation)
            measurement = self.processor.process_measurement(measurement)

        return self.agent.forward(Observation(raw_features=raw_observation,
                                              measurements=measurement),
                                  goal_params=goal_params)

    def save(self):
        self.agent.save(self.folder_path)

    def update(self):
        if self.episode_step == 0:
            self.callbacks.on_episode_begin(self.episode)

        # Has training ended yet?
        if self.step >= self.no_training_steps:
            self.save()
            # We are done here.
            self.callbacks.on_train_end()
            sys.exit(0)

        self.episode_step += 1
        self.step += 1
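
RemoteAdfp inverts the usual control flow: instead of the agent driving env.step() itself, an external process pushes one (observation, measurement, goal) tuple per tick into train_move() and receives the next action. A hedged driver-loop sketch; build_adfp_agent and sim are hypothetical placeholders, not part of the example:

remote = RemoteAdfp(agent=build_adfp_agent(),   # hypothetical ADFPAgent factory
                    training_steps=100000,
                    log_interval=1000,
                    folder_path='./runs/adfp')

goal_params = [0.5, 0.5, 1.0]          # relative weights of the measurement targets
obs, meas, done = sim.reset()          # hypothetical external simulator
while True:
    action = remote.train_move(obs, meas, goal_params, done)
    if done:                           # train_move() already closed the episode
        obs, meas, done = sim.reset()
        continue
    obs, meas, done = sim.step(action)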
Example #4
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            useShaping=False,
            learnAMDP=False,
            stateToBucket=None,
            vae=None,
            shapingFunction=None,
            nb_max_episode_steps=None,
            projectionModel=None,
            episodeToBegin=0,
            stepToBegin=0,
            extraWarmup=0,
            doTraining=True):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """

        fittingMode = ""
        if not useShaping and not learnAMDP:
            fittingMode = "NoShaping"
        elif learnAMDP and not useShaping:
            fittingMode = "learnAMDP"
        elif learnAMDP and useShaping:
            fittingMode = "learnAndUseAMDP"
        elif useShaping and shapingFunction is not None and projectionModel is None:
            fittingMode = "useShapingFunction"
        elif useShaping and projectionModel is not None and shapingFunction is None:
            fittingMode = "useProjectionModel"
        else:
            raise Exception("Invalid Combination of Options")

        print("Fitting Mode Is:")
        print(fittingMode)

        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.useShaping = useShaping
        self.training = doTraining
        self.stateToBucket = stateToBucket
        if projectionModel is not None:
            self.projectionModel = projectionModel[0]
            self.projectionGraph = projectionModel[1]
            self.projectionSession = projectionModel[2]

        if shapingFunction is not None:
            self.shapingModel = shapingFunction[0]
            self.shapingGraph = shapingFunction[1]
            self.shapingSession = shapingFunction[2]

        # `vae` defaults to None; only unpack it when provided.
        if vae is not None:
            sess = vae[0]
            vaeNetwork = vae[1]
        else:
            sess = vaeNetwork = None
        self.printVae = False

        self.extraWarmup = extraWarmup

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        self.stepToBegin = stepToBegin
        self.episode = episodeToBegin
        self.step = stepToBegin
        self.neg_reward_counter = np.int16(0)
        self.max_neg_rewards = np.int16(12)
        observation = None
        previousObservation = None

        episode_reward = None
        episode_step = None
        did_abort = False

        if fittingMode in ["learnAMDP", "learnAndUseAMDP"]:
            self.amdp = deepAMDP(numberOfActions=env.action_space.n)
        latentStatesVisited = []

        episodeStateHistory = []
        episodeColourStateHistory = []

        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(self.episode)
                    previousObservation = None

                    episode_step = np.int16(0)
                    episode_reward = np.float32(0)
                    self.accumulatedExtrinsicReward = 0
                    self.accumulatedReward = 0
                    self.accumulatedSteps = 0
                    episodeStateHistory = []
                    episodeColourStateHistory = []

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    colourObservation = observation

                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None
                    episodeStateHistory.append(observation)
                    episodeColourStateHistory.append(colourObservation)
                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        colourObservation = observation
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        episodeStateHistory.append(observation)
                        episodeColourStateHistory.append(colourObservation)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            colourObservation = observation
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = np.float32(0)
                accumulated_info = {}
                done = False

                self.accumulatedExtrinsicReward = 0  ###
                #print(action_repetition)
                for _ in range(action_repetition):

                    callbacks.on_action_begin(action)
                    previousObservation = observation
                    previousColourObservation = colourObservation
                    observation, r, done, info = env.step(action)

                    if self.printVae:

                        sess = vae[0]
                        vaeNetwork = vae[1]
                        #print(vae.encoder(tf.image.resize_images(observation.reshape(1,96,96,3), [64, 64])))
                        obs = sess.run(vaeNetwork.z,
                                       feed_dict={
                                           vaeNetwork.image:
                                           observation[None, :, :, :]
                                       })
                        #print(obs)
                        latentStatesVisited.append(obs)

                    self.accumulatedReward += r
                    self.accumulatedSteps += 1

                    colourObservation = observation
                    #self.colourMemory.append(colourObservation,0,0,0)

                    #print(observation.shape)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)

                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break

                if fittingMode in [
                        "useProjectionModel", "useShapingFunction", "learnAMDP"
                ]:

                    if fittingMode in ["useProjectionModel"]:
                        if len(episodeStateHistory) < 4:

                            if len(episodeStateHistory) == 0:
                                stackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape), observation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape)
                                ])
                            elif len(episodeStateHistory) == 1:
                                stackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    episodeStateHistory[-1], observation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    episodeStateHistory[-1]
                                ])
                            elif len(episodeStateHistory) == 2:
                                stackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    episodeStateHistory[-2],
                                    episodeStateHistory[-1], observation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    np.zeros(observation.shape),
                                    episodeStateHistory[-2],
                                    episodeStateHistory[-1]
                                ])
                            elif len(episodeStateHistory) == 3:
                                stackedObservations = np.array([
                                    episodeStateHistory[-3],
                                    episodeStateHistory[-2],
                                    episodeStateHistory[-1], observation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(observation.shape),
                                    episodeStateHistory[-3],
                                    episodeStateHistory[-2],
                                    episodeStateHistory[-1]
                                ])
                        else:
                            stackedObservations = np.array([
                                episodeStateHistory[-3],
                                episodeStateHistory[-2],
                                episodeStateHistory[-1], observation
                            ])
                            previousStackedObservations = np.array([
                                episodeStateHistory[-4],
                                episodeStateHistory[-3],
                                episodeStateHistory[-2],
                                episodeStateHistory[-1]
                            ])

                        with self.projectionGraph.as_default():
                            with self.projectionSession.as_default():

                                potentialCurrentState = max(
                                    self.projectionModel.predict(
                                        np.array([stackedObservations]))[0])
                                potentialPreviousState = max(
                                    self.projectionModel.predict(
                                        np.array([previousStackedObservations
                                                  ]))[0])
                                discountedDifference = self.gamma * potentialCurrentState - potentialPreviousState
                                #print(discountedDifference)

                    elif fittingMode in ["useShapingFunction", "learnAMDP"]:

                        if len(episodeColourStateHistory) < 4:

                            if len(episodeColourStateHistory) == 0:
                                stackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    colourObservation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape)
                                ])
                            elif len(episodeColourStateHistory) == 1:
                                stackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    episodeColourStateHistory[-1],
                                    colourObservation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    episodeColourStateHistory[-1]
                                ])
                            elif len(episodeColourStateHistory) == 2:
                                stackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    episodeColourStateHistory[-2],
                                    episodeColourStateHistory[-1],
                                    colourObservation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    np.zeros(colourObservation.shape),
                                    episodeColourStateHistory[-2],
                                    episodeColourStateHistory[-1]
                                ])
                            elif len(episodeColourStateHistory) == 3:
                                stackedObservations = np.array([
                                    episodeColourStateHistory[-3],
                                    episodeColourStateHistory[-2],
                                    episodeColourStateHistory[-1],
                                    colourObservation
                                ])
                                previousStackedObservations = np.array([
                                    np.zeros(colourObservation.shape),
                                    episodeColourStateHistory[-3],
                                    episodeColourStateHistory[-2],
                                    episodeColourStateHistory[-1]
                                ])
                        else:
                            stackedObservations = np.array([
                                episodeColourStateHistory[-3],
                                episodeColourStateHistory[-2],
                                episodeColourStateHistory[-1],
                                colourObservation
                            ])
                            previousStackedObservations = np.array([
                                episodeColourStateHistory[-4],
                                episodeColourStateHistory[-3],
                                episodeColourStateHistory[-2],
                                episodeColourStateHistory[-1]
                            ])

                        latentCurrentState = [
                            sess.run(vaeNetwork.z,
                                     feed_dict={
                                         vaeNetwork.image: obs[None, :, :, :]
                                     }).tolist()[0]
                            for obs in stackedObservations
                        ]
                        latentPreviousState = [
                            sess.run(vaeNetwork.z,
                                     feed_dict={
                                         vaeNetwork.image: obs[None, :, :, :]
                                     }).tolist()[0]
                            for obs in previousStackedObservations
                        ]
                        #latentPreviousState = list(chain.from_iterable(latentPreviousState))

                        if fittingMode in ["useShapingFunction"]:
                            with self.shapingGraph.as_default():
                                with self.shapingSession.as_default():
                                    #print(np.array(latentCurrentState).shape)
                                    latentCurrentState = np.array(
                                        latentCurrentState)
                                    latentPreviousState = np.array(
                                        latentPreviousState)

                                    latentCurrentState = latentCurrentState.reshape(
                                        (-1, 4, 32))
                                    latentPreviousState = latentPreviousState.reshape(
                                        (-1, 4, 32))
                                    #print(np.array(latentCurrentState).shape)

                                    potentialCurrentLatentState = max(
                                        self.shapingModel.predict(
                                            latentCurrentState)[0])
                                    potentialPreviousLatentState = max(
                                        self.shapingModel.predict(
                                            latentPreviousState)[0])
                                    #print(potentialCurrentLatentState, potentialPreviousLatentState)
                                    discountedDifference = self.gamma * potentialCurrentLatentState - potentialPreviousLatentState
                                    #discountedDifference = np.clip(discountedDifference, -10000, 10000)
                                    #print(discountedDifference)

                        if fittingMode in ["learnAMDP"]:
                            #print(latentCurrentState)
                            # print(np.array(latentCurrentState).shape)
                            self.amdp.addExperience(
                                np.array(latentCurrentState), action, reward,
                                done)
                            #                        discountedDifference = self.gamma*potentialCurrentState-potentialPreviousState
                            discountedDifference = 0
                    self.accumulatedExtrinsicReward = discountedDifference
                    #print(self.accumulatedExtrinsicReward)

                early_done, punishment = self.check_early_stop(
                    reward, episode_reward)
                if early_done:
                    reward += punishment
                done = done or early_done

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True

                #if not currentAbstractState == previousAbstractState:
                #print(self.accumulatedExtrinsicReward)
                episodeStateHistory.append(observation)
                episodeColourStateHistory.append(colourObservation)
                if fittingMode in [
                        "learnAndUseAMDP", "useShapingFunction",
                        "useProjectionModel"
                ]:
                    #print(omega*self.accumulatedExtrinsicReward)
                    #print(self.accumulatedExtrinsicReward)
                    #print(self.accumulatedExtrinsicReward)

                    metrics = self.backward(
                        reward,
                        reward +
                        self.currentOmega * self.accumulatedExtrinsicReward,
                        terminal=done)
                elif fittingMode in ["learnAMDP"]:
                    metrics = self.backward(reward, reward, terminal=done)
                    if self.step > self.nb_steps_warmup:
                        self.amdp.replay()
                else:
                    metrics = self.backward(reward, reward, terminal=done)
                #
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': self.episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    self.backward(0., 0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(self.episode, episode_logs)

                    self.episode += 1

                    if self.omegaStart > 0:
                        self.currentOmega = max(
                            self.omegaStart +
                            (self.episode / self.omegaEpisodes) *
                            (self.omegaEnd - self.omegaStart), self.omegaEnd)
                    #if episode > 500:
                    #   self.currentOmega = 0
                    #  self.omegaStart = 0
                    # self.omegaEnd = 0

                    #print(self.currentOmega)

                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        with open('latentVisited2.pickle', 'wb') as handle:
            pickle.dump(latentStatesVisited,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

        return history
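
The discountedDifference computed in fit() above is the usual potential-based shaping term F(s, s') = gamma * Phi(s') - Phi(s), where Phi is the maximum value predicted by the projection or shaping model, and the shaped return passed to backward() is reward + currentOmega * F. A self-contained numpy sketch of that calculation, with a toy potential model standing in for the learned networks:

import numpy as np

GAMMA = 0.99


def toy_model(stack):
    # Stand-in for self.projectionModel.predict(...)[0]: one score per "action",
    # here just the mean pixel intensity of each frame in the (4, H, W) stack.
    return np.mean(stack, axis=(1, 2))


def potential(stacked_obs, value_model):
    # Phi(s): the highest value the model predicts for the stacked frames.
    return float(np.max(value_model(stacked_obs)))


def shaping_bonus(prev_stack, curr_stack, value_model, gamma=GAMMA):
    # F(s, s') = gamma * Phi(s') - Phi(s), added on top of the env reward.
    return gamma * potential(curr_stack, value_model) - potential(prev_stack, value_model)


prev_stack = np.zeros((4, 84, 84))
curr_stack = np.ones((4, 84, 84))
env_reward = 1.0
omega = 0.1                                  # mixing weight (self.currentOmega)

shaped = env_reward + omega * shaping_bonus(prev_stack, curr_stack, toy_model)
print(shaped)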
Example #5
class RemoteAdqn(object):
    def __init__(self,
                 agent: ADQNAgent,
                 training_steps,
                 log_interval,
                 folder_path,
                 agent_persistence_manager=AgentPersistenceManager(),
                 agent_pool_size=1,
                 callbacks=None):

        # Prepare Callbacks
        callbacks = [] if not callbacks else callbacks[:]
        callbacks += [TrainIntervalLogger(interval=log_interval)]
        self.callbacks = CallbackList(callbacks)
        if hasattr(self.callbacks, 'set_model'):
            self.callbacks.set_model(agent)
        else:
            self.callbacks._set_model(agent)
        params = {
            'nb_steps': training_steps,
        }
        if hasattr(self.callbacks, 'set_params'):
            self.callbacks.set_params(params)
        else:
            self.callbacks._set_params(params)

        self.callbacks.on_train_begin()
        self.no_training_steps = training_steps
        self.agent_persistence_manager = agent_persistence_manager

        # Create needed directories if not done yet
        self.folder_path = folder_path
        checkpoint_path = folder_path + '/checkpoints'
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

        # Prepare Agent
        self.agent = agent
        self.agent.step = 0
        agent.training = True
        self.agent._on_train_begin()
        # Other parameters
        self.episode_step = 0
        self.episode = 0
        self.episode_reward = 0

        # create agent-pool
        self.agent_pool = [self.agent]
        if agent_pool_size > 1:
            for i in range(agent_pool_size - 1):
                aux_agent = ADQNAgent(
                    model=self.agent.model,
                    policy=self.agent.policy,
                    action_provider=self.agent.action_provider,
                    memory=self.agent.memory,
                    processor=self.agent.processor,
                    nb_steps_warmup=10,
                    gamma=.99,
                    delta_range=(-1., 1.),
                    target_model_update=100,
                    train_interval=4,
                    window_length=self.agent.window_length)
                self.agent_pool.append(aux_agent)

    def train_move(self, observation, reward, done):
        return self.single_agent_move(observation,
                                      reward,
                                      done,
                                      agent=self.agent)

    def test_move(self, observation):
        self.agent.training = False
        return self.agent.forward(observation)

    def save(self):
        self.agent_persistence_manager.save_agent(self.agent, self.folder_path)
        # TODO save config
        # TODO access model params
        # TODO access
        # TODO save metadata of model
        # TODO save a training history

    def single_agent_move(self, observation, reward, done, agent: Agent):
        self.agent.training = True
        if self.episode_step == 0:
            self.callbacks.on_episode_begin(self.episode)
        self.callbacks.on_step_begin(self.episode_step)

        # Has training ended yet?
        if agent.step >= self.no_training_steps:
            self.save()
            # We are done here.
            self.callbacks.on_train_end()
            sys.exit(0)

        # audit latest step
        if reward != -100 and len(agent.recent_observations) > 0:
            metrics = agent.backward(reward=reward, terminal=done)
            step_logs = {
                'action': agent.recent_action,
                'observation': agent.recent_observations[-1],
                'reward': reward,
                'metrics': metrics,
                'episode': self.episode,  # We may count episodes and steps globally, as the agents share a state.
                'info': {},
            }
            self.episode_reward += reward
            self.callbacks.on_step_end(self.episode_step, step_logs)
        # perform next step
        if done:
            # report
            episode_logs = {
                'episode_reward': self.episode_reward,
                'nb_episode_steps': self.episode_step
            }
            self.callbacks.on_episode_end(self.episode, episode_logs)

            self.episode += 1
            self.episode_step = 0
            self.episode_reward = 0

            return

        else:
            self.episode_step += 1
        action = agent.forward(observation)
        agent.step += 1

        return action
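
RemoteAdqn follows the same push-based pattern as RemoteAdfp in Example #3: an external caller supplies (observation, reward, done) each tick and gets back the next action, and a reward of -100 appears to act as a sentinel that suppresses the backward() update on the first step. A hedged driver sketch; build_adqn_agent and sim are hypothetical placeholders, not part of the example:

remote = RemoteAdqn(agent=build_adqn_agent(),   # hypothetical ADQNAgent factory
                    training_steps=500000,
                    log_interval=10000,
                    folder_path='./runs/adqn')

obs, reward, done = sim.reset(), -100, False    # hypothetical external simulator
while True:
    action = remote.train_move(obs, reward, done)
    if done:                                    # episode was closed inside train_move()
        obs, reward, done = sim.reset(), -100, False
        continue
    obs, reward, done = sim.step(action)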
Example #6
    def fit(self, env, env_1, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            file_interval=200, nb_max_episode_steps=None, save_data_path='temp.json', dynamic_actor_exploration=False, update_exploration_interval=5000):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.episode_goal = None    # (resets every episode)
        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]

        callbacks += [OjasFileLogger(save_data_path, interval=file_interval)]

        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation1 = None
        observation2 = None
        episode_reward1 = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation1 is None or observation2 is None:  # start of a new episode  

                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward1 = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation1 = deepcopy(env.reset())     
                    observation2 = deepcopy(env_1.reset())     

                    if self.actor_processor is not None:
                        # observation1 = self.learner_processor.process_observation(observation1)
                        observation1 = self.actor_processor.process_observation(observation1)
                        observation2 = self.actor_processor.process_observation(observation2)
                    assert observation1 is not None
                    assert observation2 is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action1 = env.action_space.sample()
                            action2 = env_1.action_space.sample()
                        else:
                            action1 = start_step_policy(observation1)
                            action2 = start_step_policy(observation2)
                        if self.actor_processor is not None:
                            # action1 = self.learner_processor.process_action(action1)
                            action1 = self.actor_processor.process_action(action1)
                            action2 = self.actor_processor.process_action(action2)
                        callbacks.on_action_begin(action1)
                        observation1, reward1, done1, info1 = env.step(action1)
                        observation2, reward2, done2, info2 = env_1.step(action2)
                        observation1 = deepcopy(observation1)
                        observation2 = deepcopy(observation2)
                        if self.actor_processor is not None:
                            # observation1, reward1, done1, info1 = self.learner_processor.process_step(observation1, reward1, done1, info1)
                            observation1, reward1, done1, info1 = self.actor_processor.process_step(observation1, reward1, done1, info1)
                            observation2, reward2, done2, info2 = self.actor_processor.process_step(observation2, reward2, done2, info2)
                        callbacks.on_action_end(action1)
                        if done1:
                            warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                            observation1 = deepcopy(env.reset())
                            # if self.learner_processor is not None:
                            if self.actor_processor is not None:
                                # observation1 = self.learner_processor.process_observation(observation1)
                                observation1 = self.actor_processor.process_observation(observation1)
                            break
                        if done2:
                            warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                            observation2 = deepcopy(env_1.reset())
                            if self.actor_processor is not None:
                                observation2 = self.actor_processor.process_observation(observation2)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_step is not None
                assert observation1 is not None
                assert observation2 is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)          # (Prints here if verbose = 1)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1, action2 = self.forward(observation1, observation2)
                if self.actor_processor is not None:
                    # action1 = self.learner_processor.process_action(action1)
                    action1 = self.actor_processor.process_action(action1)
                    action2 = self.actor_processor.process_action(action2)
                reward1 = 0.
                reward2 = 0.
                accumulated_info = {}
                done1 = False
                done2 = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action1)
                    observation1, r1, done1, info1 = env.step(action1)
                    observation1 = deepcopy(observation1)
                    # if self.learner_processor is not None:
                        # observation1, r1, done1, info1 = self.learner_processor.process_step(observation1, r1, done1, info1)
                    if self.actor_processor is not None:
                        observation1, r1, done1, info1 = self.actor_processor.process_step(observation1, r1, done1, info1)
                    for key, value in info1.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action1)             
                    reward1 += r1
                    if done1:
                        break

                for _ in range(action_repetition):
                    observation2, r2, done2, info2 = env_1.step(action2)
                    observation2 = deepcopy(observation2)
                    if self.actor_processor is not None:
                        observation2, r2, done2, info2 = self.actor_processor.process_step(observation2, r2, done2, info2)
                    reward2 += r2
                    if done2:
                        break

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state. (both agents take every step in parallel)
                    done1 = True
                    done2 = True

                self.backward_actor(reward1, observation1, info1, reward2, observation2, info2, env, terminal1=done1, terminal2=done2)
                metrics = self.backward_learner()
                episode_reward1 += reward1

                step_logs = {
                    'action': action1,
                    'observation': observation1,
                    'reward': reward1,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)      ## stores the current step info 

                if dynamic_actor_exploration and self.step % update_exploration_interval == 0:
                    self.update_actor_exploration()

                episode_step += 1
                self.step += 1

                if done1:

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)
                    # print("Episode: {}, Rewards: {}, Steps: {}".format(episode,episode_logs['episode_reward'],episode_logs['nb_episode_steps']))
                    episode += 1                ## CHECK!
                    observation1 = None
                    episode_step = None
                    episode_reward1 = None
                if done2:
                    observation2 = None


        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
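# A hypothetical usage sketch for the two-environment training loop above. The fit()
# signature itself is not shown in this excerpt, so every name below (env, env_1,
# nb_steps, nb_max_episode_steps, update_exploration_interval,
# dynamic_actor_exploration) is an assumption inferred from the loop body rather than
# a documented API:
#
#     history = agent.fit(env, env_1, nb_steps=50000,
#                         nb_max_episode_steps=500,
#                         update_exploration_interval=1000,
#                         dynamic_actor_exploration=True)
#
# The loop steps both environments with the pair of actions returned by a single
# forward() call, feeds both transitions to backward_actor(), and then trains with
# backward_learner() once per step.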
Exemplo n.º 7
0
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None,
            starting_checkpoints=[],
            avarage_q=None):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
            starting_checkpoints ([string]): starting checkpoint file names. When the environment is reset, one
                checkpoint from the list is drawn at random and the environment starts from that exact checkpoint.
                You can create the checkpoints using interactive_env.py.
            avarage_q (dictionary): options for measuring the average Q value after the end of each
                episode. The metric is added to the log as described in Playing Atari with Deep Reinforcement Learning.
                The start of training may be delayed, as it takes some time to choose the evaluation states.
                You can either provide the two following options or a True boolean to use the defaults
                (see the commented usage sketch below this docstring):
                    n_evaluations (integer): number of checkpoints to be evaluated and averaged (default: 10).
                    bernoulli (float): Bernoulli parameter. If the trial succeeds, the step is chosen as a checkpoint.
                        The smaller this number, the longer it takes to select the checkpoints (default: 0.1).


        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        episode_beginning = True
        try:
            self.collect_avarage_q_checkpoints(env, avarage_q,
                                               starting_checkpoints)

            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    episode_beginning = True
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()

                    if starting_checkpoints:
                        checkpoint = np.random.choice(starting_checkpoints)
                        observation = deepcopy(
                            env.reset(checkpoint='checkpoints/{}'.format(
                                checkpoint)))
                    else:
                        observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                done = False

                # NOTE (EZE): This adds needless complexity; we implement the frameskip in the emulator.
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                    # for key, value in info.items():
                    #     if not np.isreal(value):
                    #         continue
                    #     if key not in accumulated_info:
                    #         accumulated_info[key] = np.zeros_like(value)
                    #     accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True

                # if self.memory.__class__.__name__ == 'PrioritizedMemory':
                #     self.memory.append_with_error(observation, action, reward, done, episode_beginning)

                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)

                    # if self.memory.__class__.__name__ == 'PrioritizedMemory':
                    #     self.memory.append_with_error(observation)
                    # if self.memory.__class__.__name__ == 'EfficientPriorizatedMemory':
                    #     self.memory.append(observation)

                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                        'global_score': info["global_score"]
                    }

                    if self.memory.is_prioritized():
                        episode_logs['max_error_PER'] = self.memory.maximum
                        episode_logs['average_error_PER'] = self.memory.average
                        self.memory.reset_metrics()

                    if starting_checkpoints:
                        episode_logs['checkpoint'] = checkpoint

                    callbacks.on_episode_end(episode, episode_logs)
                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None):
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True
        self.nb_steps = nb_steps
        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        callbacks._set_model(self)
        callbacks._set_env(env)
        callbacks._set_params({
            'nb_steps': nb_steps,
        })
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        t = Thread(target=self.backward, args=[0, False])
        t.start()
        try:
            # make sure forward and backward are in the same graph
            with self.sess.graph.as_default():
                #while self.step < nb_steps:
                while self.back_step < nb_steps:
                    if self.backward_start_flag:
                        print("start")
                        continue
                    if observation is None:  # start of a new episode
                        callbacks.on_episode_begin(episode)
                        episode_step = 0
                        episode_reward = 0.

                        # Obtain the initial observation by resetting the environment.
                        self.reset_states()
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        assert observation is not None

                        # Perform random starts at beginning of episode and do not record them into the experience.
                        # This slightly changes the start position between games.
                        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                            nb_max_start_steps)
                        for _ in range(nb_random_start_steps):
                            if start_step_policy is None:
                                action = env.action_space.sample()
                            else:
                                action = start_step_policy(observation)
                            callbacks.on_action_begin(action)
                            observation, reward, done, info = env.step(action)
                            observation = deepcopy(observation)
                            if self.processor is not None:
                                observation, reward, done, info = self.processor.process_step(
                                    observation, reward, done, info)
                            callbacks.on_action_end(action)
                            if done:
                                warnings.warn(
                                    'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                    .format(nb_random_start_steps))
                                observation = deepcopy(env.reset())
                                if self.processor is not None:
                                    observation = self.processor.process_observation(
                                        observation)
                                print "observation shape: ", observatoin.shape
                                break

                    # At this point, we expect to be fully initialized.
                    assert episode_reward is not None
                    assert episode_step is not None
                    assert observation is not None

                    # Run a single step.
                    callbacks.on_step_begin(episode_step)
                    # This is where all of the work happens. We first perceive and compute the action
                    # (forward step) and then use the reward to improve (backward step).
                    K.manual_variable_initialization(True)
                    action = self.forward(observation)
                    #print "forward step show: ", self.step
                    #print "forward weights: ", self.sim_forward_actor.get_weights()[0]
                    #time.sleep(0.01)
                    K.manual_variable_initialization(False)
                    reward = 0.
                    accumulated_info = {}
                    done = False
                    for _ in range(action_repetition):
                        callbacks.on_action_begin(action)
                        observation, r, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, r, done, info = self.processor.process_step(
                                observation, r, done, info)
                        for key, value in info.items():
                            if not np.isreal(value):
                                continue
                            if key not in accumulated_info:
                                accumulated_info[key] = np.zeros_like(value)
                            accumulated_info[key] += value
                        callbacks.on_action_end(action)
                        reward += r
                        if done:
                            break
                    if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                        # Force a terminal state.
                        done = True

                    # Store most recent experience in memory.
                    if self.step % self.memory_interval == 0:
                        self.memory.append(self.recent_observation,
                                           self.recent_action,
                                           reward,
                                           done,
                                           training=self.training)

                    episode_reward += reward

                    step_logs = {
                        'action': action,
                        'observation': observation,
                        'reward': reward,
                        'metrics': self.metrics,
                        'episode': episode,
                        'info': accumulated_info,
                    }
                    callbacks.on_step_end(episode_step, step_logs)
                    episode_step += 1
                    self.step += 1
                    if done:
                        # We are in a terminal state but the agent hasn't yet seen it. We therefore
                        # perform one more forward-backward call and simply ignore the action before
                        # resetting the environment. We need to pass in `terminal=False` here since
                        # the *next* state, that is the state of the newly reset environment, is
                        # always non-terminal by convention.
                        self.forward(observation)
                        #self.backward(0., terminal=False)
                        if self.step % self.memory_interval == 0:
                            self.memory.append(self.recent_observation,
                                               self.recent_action,
                                               0,
                                               False,
                                               training=self.training)

                        # This episode is finished, report and reset.
                        episode_logs = {
                            'episode_reward': episode_reward,
                            'nb_episode_steps': episode_step,
                            'nb_steps': self.step,
                        }
                        callbacks.on_episode_end(episode, episode_logs)

                        episode += 1
                        observation = None
                        episode_step = None
                        episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
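# The fit() above runs self.backward() on a separate Thread while the main loop keeps
# stepping the environment. Below is a minimal, self-contained sketch of that
# producer/consumer pattern using a queue; the names and the placeholder update are
# illustrative and not taken from the code above:
import queue
import threading

transitions = queue.Queue(maxsize=1000)
stop_training = threading.Event()

def learner_loop():
    """Consume transitions and run one (placeholder) update per item."""
    while not stop_training.is_set():
        try:
            transition = transitions.get(timeout=0.1)
        except queue.Empty:
            continue
        # ... run one gradient update on `transition` here ...
        transitions.task_done()

learner_thread = threading.Thread(target=learner_loop, daemon=True)
learner_thread.start()
# The acting loop would call transitions.put((obs, action, reward, done)) once per step,
# and finally stop_training.set(); learner_thread.join() to shut down cleanly.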
Exemplo n.º 9
0
    def _run(self,
             env,
             nb_steps=None,
             nb_episodes=None,
             train=True,
             exploration=True,
             action_repetition=1,
             callbacks=None,
             verbose=1,
             render=False,
             nb_max_start_steps=0,
             start_step_policy=None,
             log_interval=10000,
             nb_max_episode_steps=None,
             reward_scaling=1.,
             plots=False,
             tensorboard=False,
             **kwargs):
        """
        Run steps until termination.
        This method shouldn't be called directly, but instead called in :func:`fit` and :func:`test`

        Termination can be either:

        * Maximal number of steps
        * Maximal number of episodes

        :param nb_steps: Number of steps before termination.
        :param nb_episodes: Number of episodes before termination.
        :param bool train: Whether to train or test the agent. Not available in the :func:`fit` and :func:`test` methods.
        :param int action_repetition: Number of times the action is repeated for each step.
        :param callbacks:
        :param int verbose: 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
        :param bool render: Render the environment in real time. This slows the run down considerably (by up to a factor of 100).
        :param nb_max_start_steps:
        :param start_step_policy: (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        :param log_interval:
        :param reward_scaling:
        :param plots: Plot metrics during training.
        :param tensorboard: Export metrics to TensorBoard.

        A standalone sketch of the two termination criteria follows this method.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `train()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        # Process the different cases when either nb_steps or nb_episodes are specified
        if (nb_steps is None and nb_episodes is None):
            raise (ValueError(
                "Please specify one (and only one) of nb_steps or nb_episodes")
                   )
        elif (nb_steps is not None and nb_episodes is None):
            termination_criterion = STEPS_TERMINATION
        elif (nb_steps is None and nb_episodes is not None):
            termination_criterion = EPISODES_TERMINATION
        elif (nb_steps is not None and nb_episodes is not None):
            raise (ValueError(
                "Please specify one (and only one) of nb_steps or nb_episodes")
                   )

        self.training = train
        # We explore only if the flag is selected and we are in train mode
        self.exploration = (train and exploration)

        # Initialize callbacks
        if callbacks is None:
            callbacks = []
        if self.training:
            if verbose == 1:
                callbacks += [TrainIntervalLogger(interval=log_interval)]
            elif verbose > 1:
                callbacks += [TrainEpisodeLogger()]
        else:
            if verbose >= 1:
                callbacks += [TestLogger()]
        callbacks = [] if not callbacks else callbacks[:]
        if render:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        if termination_criterion == STEPS_TERMINATION:
            params = {
                'nb_steps': nb_steps,
            }
        elif termination_criterion == EPISODES_TERMINATION:
            params = {
                'nb_episodes': nb_episodes,
                'nb_steps': 1,
            }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        # Add run hooks
        if tensorboard:
            from rl.hooks.tensorboard import TensorboardHook
            self.hooks.append(TensorboardHook(agent_id=self.id))
        if plots:
            from rl.hooks.plot import PortraitHook, TrajectoryHook
            self.hooks.append(PortraitHook(agent_id=self.id))
            self.hooks.append(TrajectoryHook(agent_id=self.id))

        # Define the termination criterion
        # Step and episode at which this run starts
        start_step = self.step
        start_episode = self.episode
        if termination_criterion == STEPS_TERMINATION:

            def termination():
                return (self.step - start_step >= nb_steps)
        elif termination_criterion == EPISODES_TERMINATION:

            def termination():
                return ((self.episode - start_episode >= nb_episodes
                         and self.done))

        if self.training:
            self._on_train_begin()
        else:
            self._on_test_begin()

        callbacks.on_train_begin()

        # Setup
        self.run_number += 1
        self.run_done = False
        self.done = True
        did_abort = False
        # Defined here for clarity (not strictly necessary):
        # observation: the observation before the step
        # observation_1: the observation after the step
        self.observation = None
        self.observation_1 = None
        self.action = None
        self.step_summaries = None

        # Run_init hooks
        self.hooks.run_init()

        # Run steps (and episodes) until the termination criterion is met
        while not (self.run_done):

            # Init episode
            # If we are at the beginning of a new episode, execute a startup sequence
            if self.done:
                self.episode += 1
                if self.training:
                    self.training_episode += 1
                self.episode_reward = 0.
                self.episode_step = 0
                callbacks.on_episode_begin(self.episode)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation_0 = deepcopy(env.reset())
                assert observation_0 is not None

                # Perform random steps at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                if nb_max_start_steps != 0:
                    observation_0 = self._perform_random_steps(
                        nb_max_start_steps, start_step_policy, env,
                        observation_0, callbacks)

            else:
                # We are in the middle of an episode
                # Update the observation
                observation_0 = self.observation_1
                # Increment the episode step

            # FIXME: Use only one of the two variables
            self.observation = observation_0

            # Increment the current step in both cases
            self.step += 1
            if self.training:
                self.training_step += 1
            self.episode_step += 1
            self.reward = 0.
            self.step_summaries = []
            accumulated_info = {}

            # Run a single step.
            callbacks.on_step_begin(self.episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).

            # state_0 -- (forward) --> action
            self.action = self.forward(self.observation)

            # action -- (step) --> (reward, state_1, terminal)
            # Apply the action
            # With repetition, if necessary
            for _ in range(action_repetition):
                callbacks.on_action_begin(self.action)
                self.observation_1, r, self.done, info = env.step(self.action)
                # observation_1 = deepcopy(observation_1)

                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(self.action)

                self.reward += r

                # Set episode as finished if the environment has terminated
                if self.done:
                    break

            # Scale the reward
            self.reward = self.reward * reward_scaling
            self.episode_reward += self.reward

            # End of the step
            # Stop episode if reached the step limit
            if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps:
                # Force a terminal state.
                self.done = True

            # Post step: training, callbacks and hooks
            # Train the algorithm
            self.backward()

            # step_end Hooks
            self.hooks()

            # Callbacks
            # Collect statistics
            step_logs = {
                'action': self.action,
                'observation': self.observation_1,
                'reward': self.reward,
                # For legacy callbacks support
                'metrics': [],
                'episode': self.episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(self.episode_step, step_logs)

            # Episodic callbacks
            if self.done:
                # Collect statistics
                episode_logs = {
                    'episode_reward': np.float_(self.episode_reward),
                    'nb_episode_steps': np.float_(self.episode_step),
                    'nb_steps': np.float_(self.step),
                }
                callbacks.on_episode_end(self.episode, logs=episode_logs)
                self.hooks.episode_end()

            # Stop run if termination criterion met
            if termination():
                self.run_done = True

        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()
        self.hooks.run_end()

        return (history)
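# A standalone sketch of the two termination criteria used by _run() above: stop after
# a fixed number of steps, or stop once a fixed number of episodes has *completed*.
# The helper below mirrors that logic outside the class; the names are illustrative:
def make_termination(start_step, start_episode, nb_steps=None, nb_episodes=None):
    """Return a predicate that reports whether the run should stop."""
    if (nb_steps is None) == (nb_episodes is None):
        raise ValueError("Please specify one (and only one) of nb_steps or nb_episodes")
    if nb_steps is not None:
        return lambda step, episode, done: step - start_step >= nb_steps
    return lambda step, episode, done: (episode - start_episode >= nb_episodes) and done

# Example: stop only after 3 episodes have completed, regardless of the step count.
terminate = make_termination(start_step=0, start_episode=0, nb_episodes=3)
assert terminate(step=500, episode=3, done=True)
assert not terminate(step=500, episode=3, done=False)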
Exemplo n.º 10
0
    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):

        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.agent1.training = True
        self.agent2.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    observation = deepcopy(env.reset())
                    if self.agent1.processor is not None:  # not individual for now
                        observation = self.agent1.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.agent1.processor is not None:  # not individual for now. action is not from agent anyway
                            action = self.agent1.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.agent1.processor is not None:
                            observation, reward, done, info = self.agent1.processor.process_step(observation, reward,
                                                                                                 done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. '
                                'You should probably lower the `nb_max_start_steps` parameter.'.format(
                                    nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.agent1.processor is not None:
                                observation = self.agent1.processor.process_observation(observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation)
                action2 = self.agent2.forward(observation)
                if self.agent1.processor is not None:
                    action1 = self.agent1.processor.process_action(action1)
                if self.agent2.processor is not None:
                    action2 = self.agent2.processor.process_action(action2)
                action = (np.ndarray.item(action1), np.ndarray.item(action2))
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.agent1.processor is not None:
                        observation, r, done, info = self.agent1.processor.process_step(observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward1 += info["r1"]
                    reward2 += info["r2"]
                    reward += info["r1"] + info["r2"]
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation)
                    self.agent2.forward(observation)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()


        return history
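# The two-agent fit() above builds one joint action for env.step() from the two agents'
# outputs and splits the rewards back out of info["r1"] / info["r2"]. A minimal sketch
# of the joint-action construction, with made-up action values:
import numpy as np

action1 = np.array(2)   # e.g. a discrete action index returned by agent1.forward(...)
action2 = np.array(5)   # e.g. a discrete action index returned by agent2.forward(...)
joint_action = (action1.item(), action2.item())   # tuple of plain Python ints for env.step()
assert joint_action == (2, 5)
# The environment is then expected to report per-agent rewards in info["r1"] and
# info["r2"], which are accumulated separately and fed to each agent's backward().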
Exemplo n.º 11
0
    def fit(self,
            env,
            nb_episodes,
            min_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None):
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [myTrainEpisodeLogger(self)]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        #callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
            'name': self.name,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        self.step = 0

        episode = 0
        while episode < nb_episodes or self.step < min_steps:
            callbacks.on_episode_begin(episode)
            episode_step = 0
            episode_reward = 0.
            self.reset_states()
            observation = deepcopy(env.reset())
            while True:
                callbacks.on_step_begin(episode_step)

                q_values = self.compute_q_values([observation])  # only valid for a window_length of 1
                action = self.policy.select_action(q_values=q_values)

                self.recent_observation = observation
                self.recent_action = action

                callbacks.on_action_begin(action)
                observation, reward, done, info = env.step(action)
                observation = deepcopy(observation)

                callbacks.on_action_end(action)

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True

                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                }
                callbacks.on_step_end(episode_step, step_logs)

                episode_step += 1
                self.step += 1
                self.total_step += 1

                if done:
                    self.policy.log_qvalue(q_values)
                    cur_maxq = self.qlogger.cur_maxq
                    self.q_values = q_values
                    if self.name == 'env':
                        displayQvalue(q_values)

                    self.reward_his.append(episode_reward)
                    self.max_reward = max(self.max_reward, episode_reward)

                    self.forward(observation)
                    self.backward(0., terminal=False)
                    break

            episode_logs = {
                'episode_reward': episode_reward,
                'nb_episode_steps': episode_step,
                'nb_steps': self.step,
                'q_value': q_values[action],
                'q_max': cur_maxq,
                'q_mean': np.mean(self.qlogger.mean_maxq)
            }
            callbacks.on_episode_end(episode, episode_logs)
            episode += 1

        callbacks.on_train_end(logs={'did_abort': False})
        self._on_train_end()

        return history
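# The fit() above keeps training while *either* budget is unmet, i.e. it stops only once
# at least nb_episodes episodes *and* at least min_steps steps have been run. A minimal
# sketch of that stopping rule:
def should_continue(episode, step, nb_episodes, min_steps):
    return episode < nb_episodes or step < min_steps

assert should_continue(episode=5, step=10000, nb_episodes=10, min_steps=500)     # episode budget unmet
assert should_continue(episode=12, step=100, nb_episodes=10, min_steps=500)      # step budget unmet
assert not should_continue(episode=12, step=600, nb_episodes=10, min_steps=500)  # both budgets met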
Exemplo n.º 12
0
    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.
        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        assert self.processor is None  # Removed processors here for simplification. Not needed anyway
        assert nb_max_start_steps == 0  # Removed here for simplification. Not needed anyway
        assert action_repetition == 1  # Removed here for simplification. Not needed anyway

        self.agent1.training = True
        self.agent2.training = True

        experience_for_plotting = deque()

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation1 = observation2 = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation1 is None or observation2 is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    obs = env.reset()
                    observation1 = deepcopy(obs) + (0.,)
                    observation2 = deepcopy(obs) + (0.,)

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation1 is not None
                assert observation2 is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = np.ndarray.item(self.agent1.forward(observation1))
                action2 = np.ndarray.item(self.agent2.forward(observation2))
                action = (action1, action2)
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                accumulated_info = {}
                done = False

                callbacks.on_action_begin(action)  # TODO: decide whether to report one agent's action or the combined action here.
                obs, r, done, info = env.step(action)
                if done:
                    raise AttributeError  # The episode was reset unexpectedly
                    # (see https://stackoverflow.com/questions/42787924/)

                observation1 = deepcopy(obs) + (info["u2_clipped"],)  # Append the other agent's clipped action to the observation
                observation2 = deepcopy(obs) + (info["u1_clipped"],)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward1 += info["r1"]
                reward2 += info["r2"]

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action[0] + action[1],
                    'observation': observation1,
                    'reward': reward1 + reward2,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if len(obs) == 2:
                    experience_for_plotting.append((info["t"], obs, (info["u1_clipped"], info["u2_clipped"]), (0., 0.),
                                                    r, (info["r1"], info["r2"])))

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation1)
                    self.agent2.forward(observation2)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation1 = None
                    observation2 = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()

        return experience_for_plotting
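
# A minimal sketch of how the `experience_for_plotting` deque returned above could be
# unpacked for plotting. The tuple layout follows the append() call inside the loop:
# (t, obs, (u1_clipped, u2_clipped), (0., 0.), r, (r1, r2)). The helper name and the
# choice of fields extracted here are assumptions for illustration, not part of the source.
import numpy as np


def unpack_experience(experience):
    """Split the per-step tuples into time, joint reward and per-agent rewards."""
    t = np.array([e[0] for e in experience])
    joint_reward = np.array([e[4] for e in experience])
    reward1 = np.array([e[5][0] for e in experience])
    reward2 = np.array([e[5][1] for e in experience])
    return t, joint_reward, reward1, reward2
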
Exemplo n.º 13
0
    def test_hrl(self,
                 env,
                 nb_episodes=1,
                 callbacks=None,
                 visualize=True,
                 nb_max_episode_steps=None,
                 verbose=2,
                 model_path=None):

        if model_path is not None:
            self.load_weights(model_path)
        if not self.compiled:
            raise RuntimeError(
                'You tried to test your agent but it hasn\'t been '
                'compiled yet. Please call `compile()` before `test()`.')

        self.training = False
        self.turn_left_agent.training = False
        self.go_straight_agent.training = False
        self.turn_right_agent.training = False
        self.step = np.int16(0)
        self.turn_left_agent.step = np.int16(0)
        self.go_straight_agent.step = np.int16(0)
        self.turn_right_agent.step = np.int16(0)

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        callbacks.set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
        }
        callbacks.set_params(params)
        self._on_test_begin()
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = 0.
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.reset_states()

            def random_init_state(flag=True):
                init_state = [-800, -150 - 3.75 * 5 / 2, 5, 0]
                if flag:
                    x = np.random.random() * 1000 - 800
                    lane = np.random.choice([0, 1, 2, 3])
                    y = [-150 - 3.75 * 7 / 2, -150 - 3.75 * 5 / 2,
                         -150 - 3.75 * 3 / 2, -150 - 3.75 * 1 / 2][lane]
                    v = np.random.random() * 25
                    heading = 0
                    init_state = [x, y, v, heading]
                return init_state

            observation = deepcopy(
                env.reset(init_state=random_init_state(flag=True)))
            assert observation is not None

            # Run the episode until we're done.
            done = False
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                action = self.processor.process_action(action)
                reward = 0.
                callbacks.on_action_begin(action)
                observation, reward, done, info = env.step(action)
                observation = deepcopy(observation)
                callbacks.on_action_end(action)
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True
                self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'episode': episode
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1
                self.turn_left_agent.step += 1
                self.go_straight_agent.step += 1
                self.turn_right_agent.step += 1

            # We are in a terminal state but the agent hasn't yet seen it. We therefore
            # perform one more forward-backward call and simply ignore the action before
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            self.forward(observation)
            self.backward(0., terminal=False)

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history
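
# The hard-coded y values in `random_init_state` above appear to be lane centres for a
# four-lane road with 3.75-unit lanes (presumably metres) whose upper edge sits at
# y = -150; lane k then has its centre at -150 - 3.75 * (7 - 2 * k) / 2. This geometric
# reading is an assumption inferred from the constants, shown here only as a small check.
LANE_WIDTH = 3.75
ROAD_EDGE_Y = -150.0


def lane_center(k):
    """Centre line of lane k under the assumed road geometry (k = 0 is the outermost lane)."""
    return ROAD_EDGE_Y - LANE_WIDTH * (7 - 2 * k) / 2


print([lane_center(k) for k in range(4)])
# [-163.125, -159.375, -155.625, -151.875] -- the same values used in random_init_state
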
Exemplo n.º 14
0
    def fit_hrl(self,
                env,
                nb_steps,
                random_start_step_policy,
                callbacks=None,
                verbose=1,
                visualize=False,
                pre_warm_steps=0,
                log_interval=100,
                save_interval=1,
                nb_max_episode_steps=None):

        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been'
                ' compiled yet. Please call `compile()` before `fit()`.')

        self.training = True
        self.turn_left_agent.training = True
        self.go_straight_agent.training = True
        self.turn_right_agent.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]

        parent_dir = os.path.dirname(os.path.dirname(__file__))
        callbacks += [FileLogger(filepath=os.path.join(parent_dir, 'log.json'))]
        callbacks += [
            ModelIntervalCheckpoint(filepath=os.path.join(
                parent_dir, 'checkpoints', 'model_step{step}.h5f'),
                                    interval=save_interval,
                                    verbose=1)
        ]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        callbacks.set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        callbacks.set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.step = np.int16(0)
        self.turn_left_agent.step = np.int16(0)
        self.go_straight_agent.step = np.int16(0)
        self.turn_right_agent.step = np.int16(0)
        observation = env.encoded_obs
        episode_reward = None
        episode_step = None
        did_abort = False

        # warm steps
        print('Pre-warming up:')
        for _ in range(pre_warm_steps):
            normed_action = random_start_step_policy()
            recent_action = normed_action
            recent_observation = observation  # store the normed action and the unprocessed observation
            action = self.processor.process_action(
                recent_action)  # processed action for the env: [0/1/2, goal_delta_x, acc]

            callbacks.on_action_begin(action)
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, reward, done, info = self.processor.process_step(
                    observation, reward, done, info)
            callbacks.on_action_end(action)

            self.memory.append(recent_observation,
                               recent_action[0],
                               reward,
                               done,
                               training=self.training)
            if recent_action[0] == 0:
                left_obs = np.column_stack(
                    (recent_observation[:, :30], recent_observation[:, -8:],
                     np.tile(
                         np.array([1, 0, 0]),
                         (recent_observation.shape[0], 1))))  # 30 + 8 + 3 = 41
                lower_action = recent_action[1:]
                self.turn_left_agent.memory.append(left_obs,
                                                   lower_action,
                                                   reward,
                                                   1,
                                                   training=self.training)
            elif recent_action[0] == 1:
                straight_obs = np.column_stack(
                    (deepcopy(recent_observation),
                     np.tile(np.array([0, 1, 0]),
                             (recent_observation.shape[0], 1))))  # 56 + 3 = 59
                lower_action = recent_action[1:]
                self.go_straight_agent.memory.append(straight_obs,
                                                     lower_action,
                                                     reward,
                                                     1,
                                                     training=self.training)
            else:
                right_obs = np.column_stack(
                    (recent_observation[:, 18:],
                     np.tile(
                         np.array([0, 0, 1]),
                         (recent_observation.shape[0], 1))))  # 56- 18 + 3 = 41
                lower_action = recent_action[1:]
                self.turn_right_agent.memory.append(right_obs,
                                                    lower_action,
                                                    reward,
                                                    1,
                                                    training=self.training)
            print('————————————————————————————————————————')
            print({
                'upper_memory_len: ': self.memory.nb_entries,
                'left_memory_len: ': self.turn_left_agent.memory.nb_entries,
                'straight_memory_len: ':
                self.go_straight_agent.memory.nb_entries,
                'right_memory_len: ': self.turn_right_agent.memory.nb_entries
            })
            print('————————————————————————————————————————')
            # TODO: there is always a stored point that is not actually done, but at most one such bad point ends up in the buffer.
            if done:

                def random_init_state(flag=True):
                    init_state = [-800, -150 - 3.75 * 5 / 2, 5, 0]
                    if flag:
                        x = np.random.random() * 1000 - 800
                        lane = np.random.choice([0, 1, 2, 3])
                        y = [-150 - 3.75 * 7 / 2, -150 - 3.75 * 5 / 2,
                             -150 - 3.75 * 3 / 2, -150 - 3.75 * 1 / 2][lane]
                        v = np.random.random() * 25
                        heading = 0
                        init_state = [x, y, v, heading]
                    return init_state

                observation = deepcopy(
                    env.reset(init_state=random_init_state(flag=True)))
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)

        observation = None

        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()

                    def random_init_state(flag=True):
                        init_state = [-800, -150 - 3.75 * 5 / 2, 5, 0]
                        if flag:
                            x = np.random.uniform(0, 1) * 1000 - 800
                            lane = np.random.choice([0, 1, 2, 3])
                            y = [
                                -150 - 3.75 * 7 / 2, -150 - 3.75 * 5 / 2,
                                -150 - 3.75 * 3 / 2, -150 - 3.75 * 1 / 2
                            ][lane]
                            v = np.random.uniform(0, 1) * 25
                            heading = 0
                            init_state = [x, y, v, heading]
                        return init_state

                    observation = deepcopy(
                        env.reset(init_state=random_init_state()))
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)  # this is normed action
                action = self.processor.process_action(
                    action)  # this is processed action for env
                done = False

                callbacks.on_action_begin(action)
                observation, reward, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, reward, done, info = self.processor.process_step(
                        observation, reward, done, info)
                callbacks.on_action_end(action)

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward
                step_logs = {
                    'action': action,  # processed action
                    'observation': observation,  # true obs
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode
                    # 'info': info,
                }

                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1
                self.turn_left_agent.step += 1
                self.go_straight_agent.step += 1
                self.turn_right_agent.step += 1

                memory_len = [
                    self.turn_left_agent.memory.nb_entries,
                    self.go_straight_agent.memory.nb_entries,
                    self.turn_right_agent.memory.nb_entries
                ]

                if done:
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                        'memory_len': memory_len
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
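
# A hedged sketch of a `random_start_step_policy` that would satisfy the pre-warm loop in
# `fit_hrl` above: the loop reads element 0 of the returned (normed) action as the
# sub-agent selector (0 = turn left, 1 = go straight, 2 = turn right) and the remaining
# elements as the lower-level action, which the processor later maps to
# [0/1/2, goal_delta_x, acc]. The value ranges below are assumptions for illustration only.
import numpy as np


def random_start_step_policy():
    sub_agent = np.random.choice([0, 1, 2])      # which option / sub-policy to warm up
    goal_delta_x = np.random.uniform(-1.0, 1.0)  # normed lateral goal offset
    acc = np.random.uniform(-1.0, 1.0)           # normed acceleration command
    return np.array([sub_agent, goal_delta_x, acc])
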
Exemplo n.º 15
0
    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, nb_max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
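
# A typical end-to-end way this `fit()` is called in keras-rl, shown as a hedged sketch.
# It assumes the upstream keras-rl and gym packages are installed; exact import paths,
# the environment id and the `lr` argument name can differ between keras / tf.keras /
# keras-rl versions.
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy

env = gym.make('CartPole-v1')
nb_actions = env.action_space.n

model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16, activation='relu'),
    Dense(nb_actions, activation='linear'),
])

dqn = DQNAgent(model=model, nb_actions=nb_actions,
               memory=SequentialMemory(limit=50000, window_length=1),
               policy=BoltzmannQPolicy(), nb_steps_warmup=100,
               target_model_update=1e-2)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# `fit` drives the loop shown above and returns the History callback it installed.
history = dqn.fit(env, nb_steps=10000, visualize=False, verbose=2, log_interval=1000)
print(history.history.keys())
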
Exemplo n.º 16
0
    def fit(self, env, nb_steps, action_repetition=1, callbacks=None,
            verbose=1, visualize=False, nb_max_start_steps=0,
            start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError(
                'action_repetition must be >= 1, is {}'.format(action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        # if verbose == 1:
        #     callbacks += [TrainIntervalLogger(interval=log_interval)]
        # elif verbose > 1:
        #     callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 \
                        else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(
                            self._convert_action(action))
                        env.render()
                        while info.get('env_status.env_state') is None:
                            observation, reward, done, info = env.step(
                                self._convert_action(action))
                            env.render()
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation = \
                                self.processor.process_observation(observation)
                            reward = self.processor.process_reward(reward)
                            done = done[0]
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(
                                nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                K.set_learning_phase(1)
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(
                        self._convert_action(action))
                    env.render()
                    # while info.get('n')[0].get('env_status.env_state') is None:
                    #     observation, r, done, info = env.step(
                    #         self._convert_action(action))
                    #     print(info)
                    #     print(info.get('env_status.env_state'))
                    #     env.render()
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                        done = done[0]
                        print(r, done, info)
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                K.set_learning_phase(1)
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode
                }
                print(action, reward)
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    K.set_learning_phase(1)
                    self.forward(observation)
                    K.set_learning_phase(1)
                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
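
# The variant above pins the Keras learning phase to 1 (training) around its forward and
# backward calls via `K.set_learning_phase`. A standalone illustration of that backend
# switch is below; note this API exists in older keras / tf.keras releases and has been
# removed from recent TensorFlow versions, so treat it as version-dependent.
from keras import backend as K

K.set_learning_phase(1)  # 1 = training behaviour (e.g. dropout active)
# ... build or call models whose behaviour depends on the learning phase here ...
K.set_learning_phase(0)  # 0 = inference behaviour
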
Exemplo n.º 17
0
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None):
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        callbacks = CallbackList(callbacks)
        callbacks._set_model(self)
        callbacks._set_env(env)
        callbacks._set_params({
            'nb_steps': nb_steps,
        })
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = env.reset()
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        callbacks.on_action_begin(action)
                        observation, _, done, _ = env.step(action)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = env.reset()
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                reward = 0.
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, _ = env.step(action)
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done or (nb_max_episode_steps
                            and episode_step > nb_max_episode_steps):
                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
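
# Unlike the other variants, the `fit()` above neither installs a `History` callback nor
# returns one, so per-episode logs are lost unless the caller passes a callback in
# explicitly. A hedged sketch of that workaround is below; `agent` and `env` are
# placeholders, and it assumes the surrounding `CallbackList` forwards episode-end events
# to plain Keras callbacks the way upstream keras-rl does.
from keras.callbacks import History

history = History()
agent.fit(env, nb_steps=10000, callbacks=[history])
print(history.history)  # e.g. {'episode_reward': [...], 'nb_episode_steps': [...], ...}
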
Exemplo n.º 18
0
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None,
            version=None,
            custom_env=False):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, nb_max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True
        #self.stop_training = False  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()  # records per-episode logs
        callbacks += [history]  # append the History callback
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.step = np.int16(0)
        observation = None
        episode_reward = None
        episode_step = None
        #self.episode_step = None  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
        did_abort = False

        # open workbook to store result
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet('DQN')
        sheet_step = workbook.add_sheet('step')

        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    #self.episode_step = np.int16(0)  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
                    episode_reward = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                #assert self.episode_step is not None  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                #callbacks.on_step_begin(callbacks.on_step_begin(self.episode_step))  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    # print(observation, r, done, info)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    #if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True

                if custom_env:
                    metrics = self.backward(reward[0],
                                            terminal=done)  # tran's version
                else:
                    metrics = self.backward(
                        reward, terminal=done)  # for testing with dqn_cartpole

                episode_reward += reward

                if custom_env:
                    step_logs = {
                        'action': action,
                        'observation': observation,
                        'reward': reward[0],  # tran's version
                        'metrics': metrics,
                        'episode': episode,
                        'info': accumulated_info,
                        'throughput': reward[1],
                    }
                else:
                    step_logs = {
                        'action': action,
                        'observation': observation,
                        'reward': reward,  # for testing with dqn_cartpole
                        'metrics': metrics,
                        'episode': episode,
                        'info': accumulated_info,
                    }

                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                #callbacks.on_step_end(self.episode_step, step_logs)  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
                #self.episode_step += 1  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    if custom_env:
                        episode_logs = {
                            'episode_reward':
                            episode_reward[0],  # Only return the first value
                            'throughput': episode_reward[1],
                            #'nb_episode_steps': episode_step,
                            #'nb_steps': self.step,
                            #'loss': history['loss'],
                        }
                    else:
                        episode_logs = {
                            'episode_reward':
                            episode_reward,  # seems to return an array
                            'nb_episode_steps': episode_step,
                            'nb_steps': self.step,
                        }

                    print("Episode Number: ", episode)
                    print("Episode Rewards: ", episode_reward)
                    #print("Episode Logs", episode_logs)
                    #print("Episode metrics", metrics)
                    print(history.history.keys())

                    #                    print("History Loss", hist.history['loss'])
                    #                    print("History Loss", hist.history['acc'])
                    #                    print("History Loss", hist.history['val_loss'])
                    #                    print("History Loss", hist.history['val_acc'])
                    callbacks.on_episode_end(episode, episode_logs)

                    #print("Episode Reward size is: ", len(episode_reward))
                    #print("Reward array size is: ", episode_reward)

                    sheet.write(episode + 1, 0, str(episode))
                    sheet.write(episode + 1, 1, str(episode_reward[0]))
                    sheet.write(episode + 1, 2, str(episode_reward[1]))
                    #sheet.write(episode + 1, 3, str(episode_reward[2])) # for 2
                    #sheet.write(episode + 1, 4, str(episode_reward[3])) # for 3
                    #sheet.write(episode + 1, 5, str(episode_reward[4])) # for 4

                    episode += 1
                    observation = None
                    #episode_step = None
                    self.episode_step = None  # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        file_name = 'result_v' + version + '.xls'
        # if (self.enable_double_dqn):
        #     file_name = 'DDQN_' + file_name
        # if (self.enable_dueling_network):
        #     file_name = 'Dueling_' + file_name
        workbook.save('../results/' + file_name)

        return history
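
# A minimal sketch (assumed, not part of the snippet above) of the spreadsheet
# setup this `test` method relies on: `workbook`, `sheet`, `version`, and
# `custom_env` are used but never defined here, so the names and values below
# are illustrative placeholders only.
import xlwt

version = '1'            # assumed version tag used to build the output file name
custom_env = True        # assumed flag selecting the two-component episode reward
workbook = xlwt.Workbook()
sheet = workbook.add_sheet('results')
sheet.write(0, 0, 'episode')
sheet.write(0, 1, 'episode_reward')
sheet.write(0, 2, 'throughput')
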
Exemplo n.º 19
0
class Agent(object):
    """Abstract base class for all implemented agents.

    Each agent interacts with the environment (as defined by the `Env` class) by first observing the
    state of the environment. Based on this observation the agent changes the environment by performing
    an action.

    Do not use this abstract base class directly but instead use one of the concrete agents implemented.
    Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same
    interface, you can use them interchangeably.

    To implement your own agent, you have to implement the following methods:

    - `forward`
    - `backward`
    - `compile`
    - `load_weights`
    - `save_weights`
    - `layers`
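
    A minimal sketch of such a subclass is shown directly after this class definition.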

    # Arguments
        processor (`Processor` instance): See [Processor](#processor) for details.
    """
    def __init__(self, processor=None):
        self.processor = processor
        self.training = False
        self.step = 0

    def get_config(self):
        """Configuration of the agent for serialization.
        """
        return {}

    def init_fit_parallel(self,
                          nb_steps=10000,
                          sampler_update_interval=500,
                          action_repetition=1,
                          callbacks=None,
                          verbose=1,
                          visualize=False,
                          nb_max_start_steps=0,
                          start_step_policy=None,
                          log_interval=10000,
                          nb_max_episode_steps=None):
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        self.training_callbacks = []  # if not self.training_callbacks else self.training_callbacks[:]
        if callbacks:
            self.training_callbacks += callbacks

        if verbose == 1:
            self.training_callbacks += [
                TrainIntervalLogger(interval=log_interval)
            ]
        elif verbose > 1:
            self.training_callbacks += [TrainEpisodeLogger()]
        if visualize:
            self.training_callbacks += [Visualizer()]
        self.training_history = History()
        self.training_callbacks += [self.training_history]
        self.training_callbacks = CallbackList(self.training_callbacks)
        if hasattr(self.training_callbacks, 'set_model'):
            self.training_callbacks.set_model(self)
        else:
            self.training_callbacks._set_model(self)
        # # self.training_callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(self.training_callbacks, 'set_params'):
            self.training_callbacks.set_params(params)
        else:
            self.training_callbacks._set_params(params)
        self._on_train_begin()
        self.training_callbacks.on_train_begin()

        self.episode = 0
        self.episode_step = 0
        self.episode_reward = 0.
        self.step = 0
        self.episode_fit_calls = 0
        self.episode_backward_time = dt.timedelta()
        self.episode_n_backward_calls = 0

    def fit_parallel(self,
                     experience_list,
                     sampler_update_interval=500,
                     n_backward_calls=1):
        done = False

        if self.episode_step == 0 and len(
                experience_list) > 0:  # start of a new "episode"
            self.training_callbacks.on_episode_begin(self.episode)

        try:
            self.training_callbacks.on_step_begin(self.episode_step)

            start_time = dt.datetime.now()
            metrics = self.backward(experience_list)
            for _ in range(n_backward_calls - 1):
                metrics = self.backward([])
            end_time = dt.datetime.now()
            elapsed_time = end_time - start_time
            self.episode_backward_time += elapsed_time
            self.episode_n_backward_calls += n_backward_calls

            for e in experience_list:
                step_logs = {
                    'action': e.action,
                    'observation': e.state1,
                    'reward': e.reward,
                    'metrics': metrics,
                    'episode': self.episode,
                    'info': {},
                    'done': e.terminal1,
                    # 'experiences': len(experience_list)
                }
                if hasattr(e, "cumulativereward"):
                    step_logs['cumulativereward'] = e.cumulativereward
                if hasattr(e, "workerid"):
                    step_logs['workerid'] = e.workerid
                if hasattr(e, "epnum"):
                    step_logs['epnum'] = e.epnum
                if hasattr(e, "seed"):
                    step_logs['seed'] = e.seed
                self.training_callbacks.on_step_end(self.episode_step,
                                                    step_logs)
                self.episode_step += 1
                self.step += 1
                self.episode_reward += e.reward
            self.episode_fit_calls += 1

            if self.episode_step >= sampler_update_interval:
                done = True

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                #self.forward(observation)
                #self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': self.episode_reward,
                    'nb_episode_steps': self.episode_step,
                    'nb_steps': self.step,
                }
                self.training_callbacks.on_episode_end(self.episode,
                                                       episode_logs)

                print(
                    "Main Thread: episode: %d, # new experiences: %d, backward/experience: %.2f, backward/sec: %.2f, backward/fit: %d, backward/ep: %d"
                    %
                    (self.episode + 1, self.episode_step,
                     float(self.episode_n_backward_calls) / self.episode_step,
                     float(self.episode_n_backward_calls) /
                     self.episode_backward_time.total_seconds(),
                     n_backward_calls, self.episode_n_backward_calls))

                self.episode += 1
                self.episode_step = 0
                self.episode_reward = 0.
                self.episode_backward_time = dt.timedelta()
                self.episode_n_backward_calls = 0
                self.episode_fit_calls = 0

                # episode += 1
                # observation = None
                # episode_step = None
                # episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        # self.training_callbacks.on_train_end(logs={'did_abort': False})
        # self._on_train_end()

        return None
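
    # A rough usage sketch for the parallel path above (assumed driver code, not
    # part of this class). A separate sampler gathers transitions and the main
    # thread feeds them to `fit_parallel`; each experience object is expected to
    # expose at least `state1`, `action`, `reward`, and `terminal1`, plus the
    # optional `cumulativereward`/`workerid`/`epnum`/`seed` fields logged above.
    #
    #   agent.init_fit_parallel(nb_steps=100000, log_interval=1000)
    #   while True:                                    # hypothetical driver loop
    #       batch = sampler.get_new_experiences()      # hypothetical sampler API
    #       agent.fit_parallel(batch, sampler_update_interval=500,
    #                          n_backward_calls=1)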

    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.
        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
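
    # Typical call pattern for `fit` (a hedged sketch; the concrete agent class,
    # model, memory, policy, and environment name below are assumptions, not
    # defined in this snippet):
    #
    #   env = gym.make('CartPole-v1')
    #   agent = DQNAgent(model=model, memory=memory, policy=policy,
    #                    nb_actions=env.action_space.n)
    #   agent.compile(Adam(lr=1e-3), metrics=['mae'])
    #   history = agent.fit(env, nb_steps=50000, verbose=1, visualize=False)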

    def test(self,
             env,
             nb_episodes=1,
             action_repetition=1,
             callbacks=None,
             visualize=True,
             nb_max_episode_steps=None,
             nb_max_start_steps=0,
             start_step_policy=None,
             verbose=1):
        """Tests the agent on the given environment.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = False
        self.step = 0

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        self._on_test_begin()
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = 0.
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.reset_states()
            observation = deepcopy(env.reset())
            if self.processor is not None:
                observation = self.processor.process_observation(observation)
            assert observation is not None

            # Perform random starts at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.
            nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                nb_max_start_steps)
            for _ in range(nb_random_start_steps):
                if start_step_policy is None:
                    action = env.action_space.sample()
                else:
                    action = start_step_policy(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                callbacks.on_action_end(action)
                if done:
                    warnings.warn(
                        'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                        .format(nb_random_start_steps))
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    break

            # Run the episode until we're done.
            done = False
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, d, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, d, info = self.processor.process_step(
                            observation, r, d, info)
                    callbacks.on_action_end(action)
                    reward += r
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True
                self.backward(reward, observation, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

            # We are in a terminal state but the agent hasn't yet seen it. We therefore
            # perform one more forward-backward call and simply ignore the action before
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            #self.forward(observation)
            #self.backward(0., terminal=False)

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history

    def reset_states(self):
        """Resets all internally kept states after an episode is completed.
        """
        pass

    def forward(self, observation):
        """Takes an observation from the environment and returns the action to be taken next.
        If the policy is implemented by a neural network, this corresponds to a forward (inference) pass.

        # Argument
            observation (object): The current observation from the environment.

        # Returns
            The next action to be executed in the environment.
        """
        raise NotImplementedError()

    def backward(self, reward, nextobservation, terminal):
        """Updates the agent after having executed the action returned by `forward`.
        If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.

        # Arguments
            reward (float): The observed reward after executing the action returned by `forward`.
            nextobservation (object): The observation of the environment after the action was executed.
            terminal (boolean): `True` if the new state of the environment is terminal.
        """
        raise NotImplementedError()

    def compile(self, optimizer, metrics=[]):
        """Compiles an agent and the underlying models to be used for training and testing.

        # Arguments
            optimizer (`keras.optimizers.Optimizer` instance): The optimizer to be used during training.
            metrics (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training.
        """
        raise NotImplementedError()

    def load_weights(self, filepath):
        """Loads the weights of an agent from an HDF5 file.

        # Arguments
            filepath (str): The path to the HDF5 file.
        """
        raise NotImplementedError()

    def save_weights(self, filepath, overwrite=False):
        """Saves the weights of an agent as an HDF5 file.

        # Arguments
            filepath (str): The path to where the weights should be saved.
            overwrite (boolean): If `False` and `filepath` already exists, raises an error.
        """
        raise NotImplementedError()

    @property
    def layers(self):
        """Returns all layers of the underlying model(s).
        
        If the concrete implementation uses multiple internal models,
        this method returns them in a concatenated list.
        """
        raise NotImplementedError()

    @property
    def metrics_names(self):
        """The human-readable names of the agent's metrics. Must return as many names as there
        are metrics (see also `compile`).
        """
        return []

    def _on_train_begin(self):
        """Callback that is called before training begins.
        """
        pass

    def _on_train_end(self):
        """Callback that is called after training ends.
        """
        pass

    def _on_test_begin(self):
        """Callback that is called before testing begins.
        """
        pass

    def _on_test_end(self):
        """Callback that is called after testing ends.
        """
        pass
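

# A minimal sketch of a concrete subclass, as referenced in the class docstring
# above. This is an illustrative assumption (not part of the library): a "random"
# agent that satisfies the abstract interface but does not learn anything.
import numpy as np


class RandomAgent(Agent):
    def __init__(self, nb_actions, processor=None):
        super(RandomAgent, self).__init__(processor=processor)
        self.nb_actions = nb_actions
        self.compiled = False

    def compile(self, optimizer, metrics=[]):
        # Nothing to build here; just mark the agent as ready for fit()/test().
        self.compiled = True

    def forward(self, observation):
        # Ignore the observation and pick a uniformly random action.
        return np.random.randint(self.nb_actions)

    def backward(self, reward, nextobservation=None, terminal=False):
        # No learning takes place, so no metrics are produced. The optional
        # `nextobservation` keeps the signature compatible with both the
        # two-argument and three-argument `backward` calls used above.
        return []

    def load_weights(self, filepath):
        pass

    def save_weights(self, filepath, overwrite=False):
        pass

    @property
    def layers(self):
        return []
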
Exemplo n.º 20
0
    def test(self,
             env,
             nb_episodes=1,
             action_repetition=1,
             callbacks=None,
             visualize=True,
             nb_max_episode_steps=None,
             nb_max_start_steps=0,
             start_step_policy=None,
             verbose=1):
        if not self.compiled:
            raise RuntimeError(
                'You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = False
        self.step = 0

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        callbacks._set_model(self)
        callbacks._set_env(env)
        callbacks._set_params({
            'nb_episodes': nb_episodes,
        })

        self._on_test_begin()
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = 0.
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.reset_states()
            observation = env.reset()
            assert observation is not None

            # Perform random starts at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.
            nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                nb_max_start_steps)
            for _ in range(nb_random_start_steps):
                if start_step_policy is None:
                    action = env.action_space.sample()
                else:
                    action = start_step_policy(observation)
                callbacks.on_action_begin(action)
                observation, _, done, _ = env.step(action)
                callbacks.on_action_end(action)
                if done:
                    warnings.warn(
                        'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                        .format(nb_random_start_steps))
                    observation = env.reset()
                    break

            # Run the episode until we're done.
            done = False
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                reward = 0.
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, d, _ = env.step(action)
                    callbacks.on_action_end(action)
                    reward += r
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True
                self.backward(reward, terminal=done)
                episode_reward += reward

                callbacks.on_step_end(episode_step)
                episode_step += 1
                self.step += 1

            # We are in a terminal state but the agent hasn't yet seen it. We therefore
            # perform one more forward-backward call and simply ignore the action before
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            self.forward(observation)
            self.backward(0., terminal=False)

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history
Exemplo n.º 21
0
    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):

        for dqagent in self.dqagents:
            if not dqagent.compiled:
                raise RuntimeError(
                    'You tried to fit your agents but one hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError(
                'action_repetition must be >= 1, is {}'.format(action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.step = np.int16(0)
        observations = []
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                # check if observations is empty
                if observations == []:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward = np.float32([0,0])

                    # Obtain the initial observation by resetting the environment.
                    self.dqagents[0].reset_states()
                    self.dqagents[1].reset_states()
                    observations = deepcopy(env.reset())
                    if self.processor is not None:
                        # process all observations
                        observations = [self.processor.process_observation(
                            observation) for observation in observations]
                    assert observations != []

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.

                    # can remove this bit, not gonna use any random starts
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            actions = env.action_space.sample()
                        else:
                            actions = start_step_policy(observations)
                        if self.processor is not None:
                            actions = self.processor.process_action(actions)
                        callbacks.on_action_begin(actions)
                        observations, rewards, done, info = env.step(actions)
                        observations = deepcopy(observations)
                        if self.processor is not None:
                            observations, rewards, done, info = self.processor.process_step(
                                observations, rewards, done, info)
                        callbacks.on_action_end(actions)
                        if done:
                            warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(
                                nb_random_start_steps))
                            observations = deepcopy(env.reset())
                            if self.processor is not None:
                                observations = [self.processor.process_observation(
                                    observation) for observation in observations]
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observations != []

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).

                # given indices [0,3] are hider indices and [4,5] are seeker indices
                actions = []
                for i in range(2,6):
                    actions.append(self.dqagents[0].forward(observations[i]))
                for i in range(0,2):
                    actions.append(self.dqagents[1].forward(observations[i]))

                # process all actions
                if self.processor is not None:
                    actions = [self.processor.process_action(action) for action in actions]
                rewards = np.float32([0, 0])
                hider_reward = 0.
                seeker_reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(actions)
                    # expect rs[0] to be aggregate hider reward, rs[1] aggregate seeker reward
                    observations, rs, done, info = env.step(actions)
                    observations = deepcopy(observations)
                    if self.processor is not None:
                        observations, rs, done, info = self.processor.process_step(
                            observations, rs, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(actions)
                    hider_reward += rs[0]
                    seeker_reward += rs[1]
                    rewards += rs
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True

                # run backward step w.r.t. each agent's respective aggregate reward
                hider_metrics = self.dqagents[0].backward(hider_reward, terminal=done)
                seeker_metrics = self.dqagents[1].backward(seeker_reward, terminal=done)
                episode_reward += rewards

                step_logs = {
                    'actions': actions,
                    'observations': observations,
                    'hider_reward': hider_reward,
                    'hider_metrics': hider_metrics,
                    'seeker_reward': seeker_reward,
                    'seeker_metrics': seeker_metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    for i in range(2,6):
                        self.dqagents[0].forward(observations[i])
                    for i in range(0,2):
                        self.dqagents[1].forward(observations[i])

                    self.dqagents[0].backward(0., terminal=False)
                    self.dqagents[1].backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observations = []
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history

    def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
             nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
        """Tests the agents on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_episodes (integer): Number of episodes to perform.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire test process.
        """
        for dqagent in self.dqagents:
            if not dqagent.compiled:
                raise RuntimeError(
                    'You tried to test your agents but one hasn\'t been compiled yet. Please call `compile()` before `test()`.')
        if action_repetition < 1:
            raise ValueError(
                'action_repetition must be >= 1, is {}'.format(action_repetition))

        self.training = False
        self.step = 0

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        self._on_test_begin()
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = np.float32([0, 0])
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.dqagents[0].reset_states()
            self.dqagents[1].reset_states()
            observations = deepcopy(env.reset())
            if self.processor is not None:
                observations = [self.processor.process_observation(observation) for observation in observations]
            assert observations != []

            # Perform random starts at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.

            # this is never executed with default args.
            nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                nb_max_start_steps)
            for _ in range(nb_random_start_steps):
                if start_step_policy is None:
                    action = env.action_space.sample()
                else:
                    action = start_step_policy(observations)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                callbacks.on_action_begin(action)
                observations, rs, done, info = env.step(action)
                observations = deepcopy(observations)
                if self.processor is not None:
                    observations, rs, done, info = self.processor.process_step(
                        observations, rs, done, info)
                callbacks.on_action_end(action)
                if done:
                    warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(
                        nb_random_start_steps))
                    observations = deepcopy(env.reset())
                    if self.processor is not None:
                        observations = [self.processor.process_observation(observation)
                                        for observation in observations]
                    break

            # Run the episode until we're done.
            done = False
            while not done:
                callbacks.on_step_begin(episode_step)

                actions = []
                for i in range(2,6):
                    actions.append(self.dqagents[0].forward(observations[i]))
                for i in range(0,2):
                    actions.append(self.dqagents[1].forward(observations[i]))

                if self.processor is not None:
                    actions = [self.processor.process_action(action) for action in actions]
                rewards = np.float32([0., 0.])
                hider_reward = 0.
                seeker_reward = 0.
                accumulated_info = {}
                for _ in range(action_repetition):
                    callbacks.on_action_begin(actions)
                    observations, rs, d, info = env.step(actions)
                    observations = deepcopy(observations)
                    if self.processor is not None:
                        observations, rs, d, info = self.processor.process_step(
                            observations, rs, d, info)
                    callbacks.on_action_end(actions)
                    hider_reward += rs[0]
                    seeker_reward += rs[1]
                    rewards += rs
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True
                self.dqagents[0].backward(hider_reward, terminal=done)
                self.dqagents[1].backward(seeker_reward, terminal=done)
                episode_reward += rewards

                step_logs = {
                    'action': actions,
                    'observation': observations,
                    'rewards': rewards,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

            # We are in a terminal state but the agent hasn't yet seen it. We therefore
            # perform one more forward-backward call and simply ignore the action before
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            for i in range(2, 6):
                self.dqagents[0].forward(observations[i])
            for i in range(0, 2):
                self.dqagents[1].forward(observations[i])

            self.dqagents[0].backward(0., terminal=False)
            self.dqagents[1].backward(0., terminal=False)

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history

    def _on_train_begin(self):
        """Callback that is called before training begins.
        """
        pass

    def _on_train_end(self):
        """Callback that is called after training ends.
        """
        pass

    def _on_test_begin(self):
        """Callback that is called before testing begins.
        """
        pass

    def _on_test_end(self):
        """Callback that is called after testing ends.
        """
        pass

class MultiProcessor(object):
    """Abstract base class for implementing processors.

    A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can
    be necessary if your agent has different requirements with respect to the form of the
    observations, actions, and rewards of the environment. By implementing a custom processor,
    you can effectively translate between the two without having to change the underlying
    implementation of the agent or environment.

    Do not use this abstract base class directly but instead use one of the concrete implementations
    or write your own.
    """

    def process_step(self, observations, rewards, done, info):
        """Processes an entire step by applying the processor to the observations, rewards, and info arguments.

        # Arguments
            observations (list): The per-agent observations as obtained by the environment.
            rewards (list): The per-agent rewards as obtained by the environment.
            done (boolean): `True` if the environment is in a terminal state, `False` otherwise.
            info (dict): The debug info dictionary as obtained by the environment.

        # Returns
            The tuple (observations, rewards, done, info) with all elements after being processed.
        """
        observations = [self.process_observation(observation) for observation in observations]
        rewards = [self.process_reward(reward) for reward in rewards]
        info = self.process_info(info)
        return observations, rewards, done, info

    def process_observation(self, observation):
        """Processes the observation as obtained from the environment for use in an agent and
        returns it.

        # Arguments
            observation (object): An observation as obtained by the environment

        # Returns
            The processed observation
        """
        return observation

    def process_reward(self, reward):
        """Processes the reward as obtained from the environment for use in an agent and
        returns it.

        # Arguments
            reward (float): A reward as obtained by the environment

        # Returns
            The processed reward
        """
        return reward

    def process_info(self, info):
        """Processes the info as obtained from the environment for use in an agent and
        returns it.

        # Arguments
            info (dict): An info as obtained by the environment

        # Returns
            The processed info
        """
        return info

    def process_action(self, action):
        """Processes an action predicted by an agent but before execution in an environment.

        # Arguments
            action (int): Action given to the environment

        # Returns
            Processed action given to the environment
        """
        return action

    def process_state_batch(self, batch):
        """Processes an entire batch of states and returns it.

        # Arguments
            batch (list): List of states

        # Returns
            Processed list of states
        """
        return batch

    @property
    def metrics(self):
        """The metrics of the processor, which will be reported during training.

        # Returns
            List of `lambda y_true, y_pred: metric` functions.
        """
        return []

    @property
    def metrics_names(self):
        """The human-readable names of the agent's metrics. Must return as many names as there
        are metrics (see also `compile`).
        """
        return []
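

# A small illustrative subclass (an assumption, not part of the original code):
# a processor that casts each per-agent observation to float32 and clips each
# per-agent reward to [-1, 1], relying on the list-wise `process_step` above.
import numpy as np


class ClippingMultiProcessor(MultiProcessor):
    def process_observation(self, observation):
        # Cast a single agent's observation to a float32 array.
        return np.asarray(observation, dtype=np.float32)

    def process_reward(self, reward):
        # Clip a single agent's reward into [-1, 1].
        return float(np.clip(reward, -1., 1.))
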
Exemplo n.º 22
0
    def _run(self,
             env,
             nb_steps=None,
             nb_episodes=None,
             training=True,
             action_repetition=1,
             callbacks=None,
             verbose=1,
             visualize=False,
             nb_max_start_steps=0,
             start_step_policy=None,
             log_interval=10000,
             nb_max_episode_steps=None,
             reward_scaling=1.):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            nb_episodes (integer): Number of episodes to perform
            training (boolean): Whether to train or test the agent
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
            reward_scaling (float): Factor by which each accumulated reward is scaled before it is passed to `backward`.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        # Handle the different cases depending on whether nb_steps or nb_episodes is specified
        if nb_steps is None and nb_episodes is None:
            raise ValueError(
                "Please specify one (and only one) of nb_steps and nb_episodes")
        elif nb_steps is not None and nb_episodes is None:
            termination_criterion = STEPS_TERMINATION
        elif nb_steps is None and nb_episodes is not None:
            termination_criterion = EPISODES_TERMINATION
        else:
            raise ValueError(
                "Please specify one (and only one) of nb_steps and nb_episodes")

        self.training = training

        # Initialize callbacks (work on a copy so the caller's list is not mutated)
        callbacks = [] if not callbacks else callbacks[:]
        if self.training:
            if verbose == 1:
                callbacks += [TrainIntervalLogger(interval=log_interval)]
            elif verbose > 1:
                callbacks += [TrainEpisodeLogger()]
        else:
            if verbose >= 1:
                callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)

        if termination_criterion == STEPS_TERMINATION:
            params = {
                'nb_steps': nb_steps,
            }
        elif termination_criterion == EPISODES_TERMINATION:
            params = {
                'nb_episodes': nb_episodes,
                'nb_steps': 1,
            }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        # Initialize the Hooks
        hooks = Hooks(self,
                      [TensorboardHook(),
                       PortraitHook(),
                       TrajectoryHook()])

        # Define the termination criterion
        # Step and episode counters at the start of this call
        start_step = self.step
        start_episode = self.episode
        if termination_criterion == STEPS_TERMINATION:
            def termination():
                return self.step - start_step > nb_steps
        elif termination_criterion == EPISODES_TERMINATION:
            def termination():
                return self.episode - start_episode > nb_episodes

        if self.training:
            self._on_train_begin()
        else:
            self._on_test_begin()

        callbacks.on_train_begin()

        # Setup
        self.done = True
        did_abort = False
        # observation_0: observation before the step
        # observation_1: observation after the step
        observation_0 = None
        observation_1 = None
        self.step_summaries = None

        try:
            # Run steps (and episodes) until the termination criterion is met
            while not termination():
                # Init episode
                # If we are at the beginning of a new episode, execute a startup sequence
                if self.done:
                    self.episode += 1
                    self.episode_reward = 0.
                    self.episode_step = 0
                    callbacks.on_episode_begin(self.episode)

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation_0 = deepcopy(env.reset())
                    assert observation_0 is not None

                    # Perform random steps at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    if nb_max_start_steps != 0:
                        observation_0 = self._perform_random_steps(
                            nb_max_start_steps, start_step_policy, env,
                            observation_0, callbacks)

                else:
                    # We are in the middle of an episode: carry the previous
                    # post-step observation over as the new pre-step observation.
                    observation_0 = observation_1

                # FIXME: Use only one of the two variables
                self.observation = observation_0

                # Increment the current step in both cases
                self.step += 1
                self.episode_step += 1
                self.reward = 0.
                accumulated_info = {}

                # Run a single step.
                callbacks.on_step_begin(self.episode_step)
                # This is where all the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).

                # state_0 -- (forward) --> action
                action = self.forward(observation_0)
                # Process the action
                action = self.processor.process_action(action)

                # action -- (step) --> (reward, state_1, terminal)
                # Apply the action
                # With repetition, if necessary
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation_1, r, self.done, info = env.step(action)
                    # observation_1 = deepcopy(observation_1)

                    observation_1, r, self.done, info = self.processor.process_step(
                        observation_1, r, self.done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)

                    self.reward += r

                    # Set episode as finished if the environment has terminated
                    if self.done:
                        break

                # Scale the reward
                self.reward = self.reward * reward_scaling
                self.episode_reward += self.reward

                # End of the step
                # Stop episode if reached the step limit
                if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps:
                    # Force a terminal state.
                    self.done = True

                # Post step: training, callbacks and hooks
                # Train the algorithm
                metrics, self.step_summaries = self.backward(
                    observation_0,
                    action,
                    self.reward,
                    observation_1,
                    terminal=self.done)

                # Hooks
                hooks()

                # Callbacks
                # Collect statistics
                step_logs = {
                    'action': action,
                    'observation': observation_1,
                    'reward': self.reward,
                    'metrics': metrics,
                    'episode': self.episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(self.episode_step, step_logs)

                # Episodic callbacks
                if self.done:
                    # Collect statistics
                    episode_logs = {
                        'episode_reward': np.float_(self.episode_reward),
                        'nb_episode_steps': np.float_(self.episode_step),
                        'nb_steps': np.float_(self.step),
                    }
                    callbacks.on_episode_end(self.episode, logs=episode_logs)

        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True

        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
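A usage sketch for `_run`, assuming an already-compiled agent that exposes this method and a Gym-style environment; `make_agent` and the environment id are placeholders, not part of the listing above.

import gym

env = gym.make('CartPole-v1')            # placeholder environment
agent = make_agent(env)                  # hypothetical factory; the agent must be compiled

# Train for a fixed number of steps, with up to 10 random warm-up steps per episode
# and rewards scaled down before the backward pass.
history = agent._run(env,
                     nb_steps=50000,
                     training=True,
                     nb_max_start_steps=10,
                     start_step_policy=lambda obs: env.action_space.sample(),
                     reward_scaling=0.1)

# Evaluate for a fixed number of episodes instead; exactly one of nb_steps and
# nb_episodes may be given per call.
agent._run(env, nb_episodes=5, training=False, visualize=True)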
Exemplo n.º 23
0
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None,
            stepper=False):
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True
        self.stepper = stepper

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                penalty = 0
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if self.manual:
                            action = int(raw_input("action?\n"))
                        elif start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                            if self.shield is not None:
                                if self.maze:
                                    inp = get_input_maze(observation)
                                else:
                                    inp = get_input(observation)
                                action_bin = to_bin(action)
                                if not self.huge_neg:
                                    action = self.shield(
                                        inp[0], inp[1], inp[2], action_bin[0],
                                        action_bin[1], action_bin[2])
                                elif self.huge_neg:
                                    if to_int(
                                            self.shield(
                                                inp[0], inp[1], inp[2],
                                                action_bin[0], action_bin[1],
                                                action_bin[2])) != action:
                                        penalty = -10
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        if self.stepper:
                            action = int(raw_input("action?\n"))
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                #print observation
                if self.manual:
                    oldaction = self.forward(observation, manual=True)
                elif self.preemptive:
                    banned_actions = []
                    inp = get_input(observation)
                    for an_action in range(0, 8):
                        an_action_bin = to_bin(an_action)
                        action = to_int(
                            self.shield.move(inp[0], inp[1], inp[2], inp[3],
                                             an_action_bin[0],
                                             an_action_bin[1],
                                             an_action_bin[2]))
                        if action != an_action:
                            banned_actions.append(an_action)
                    oldaction = self.forward(observation,
                                             manual=False,
                                             banned_actions=banned_actions)
                else:
                    oldaction = self.forward(observation, manual=False)
                    # print oldaction
                if self.shield is not None:
                    if self.maze:
                        inp = get_input_maze(observation)
                    else:
                        inp = get_input(observation)
                    action_bin = to_bin(oldaction)
                    # sleep(0.01)
                    if self.preemptive:
                        action = oldaction
                    elif not self.huge_neg:
                        action = to_int(
                            self.shield.move(inp[0], inp[1], inp[2], inp[3],
                                             action_bin[0], action_bin[1],
                                             action_bin[2]))
                    elif self.huge_neg:
                        if to_int(
                                self.shield(inp[0], inp[1], inp[2],
                                            action_bin[0], action_bin[1],
                                            action_bin[2])) != oldaction:
                            penalty = -10
                        action = oldaction
                else:
                    action = oldaction
                #print action, oldaction
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r + penalty, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r + penalty
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                oldstep_logs = {
                    'action': oldaction,
                    'observation': observation,
                    'reward': -1,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                # if correction:
                #     callbacks.on_step_end(episode_step, oldstep_logs)
                #     episode_step += 1
                #     self.step += 1

                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
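The `fit` above interleaves a safety "shield" with the learned policy: depending on the flags, the proposed action is either replaced by the shield's output (hard shielding) or kept and penalized when the shield disagrees (the `huge_neg` branch). A minimal sketch of that pattern in isolation, with `shield_fn` as an assumed `(observation, action) -> safe_action` callable rather than the bit-encoded `get_input`/`to_bin`/`to_int` helpers used above.

def shielded_action(proposed, observation, shield_fn, override=True, penalty=-10.):
    """Return (action_to_execute, extra_reward) after consulting a safety shield."""
    safe = shield_fn(observation, proposed)
    if override:
        # Hard shielding: always execute the shield-approved action.
        return safe, 0.
    # Soft shielding: keep the agent's action but add a penalty when it was unsafe.
    return proposed, (penalty if safe != proposed else 0.)

The extra reward would then be added to `r` before `process_step`, mirroring the `r + penalty` calls in the listing above.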
Exemplo n.º 24
0
    def fit(
        self,
        env,
        nb_steps,
        action_repetition=1,
        callbacks=None,
        verbose=1,
        visualize=False,
        nb_max_start_steps=0,
        start_step_policy=None,
        log_interval=10000,
        nb_max_episode_steps=None,
    ):
        if not self.compiled:
            raise RuntimeError(
                "Your tried to fit your agent but it hasn't been compiled yet. Please call `compile()` before `fit()`."
            )
        if action_repetition < 1:
            raise ValueError("action_repetition must be >= 1, is {}".format(action_repetition))

        self.training = True

        # Work on a copy so a caller-provided list is never mutated across calls.
        callbacks = [] if not callbacks else callbacks[:]
        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        callbacks = CallbackList(callbacks)
        callbacks._set_model(self)
        callbacks._set_env(env)
        callbacks._set_params({"nb_steps": nb_steps})
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.0

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = env.reset()
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        callbacks.on_action_begin(action)
                        observation, _, done, _ = env.step(action)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                "Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.".format(
                                    nb_random_start_steps
                                )
                            )
                            observation = env.reset()
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                reward = 0.0
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, _ = env.step(action)
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    "action": action,
                    "observation": observation,
                    "reward": reward,
                    "metrics": metrics,
                    "episode": episode,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done or (nb_max_episode_steps and episode_step > nb_max_episode_steps):
                    # This episode is finished, report and reset.
                    episode_logs = {
                        "episode_reward": episode_reward,
                        "nb_episode_steps": episode_step,
                        "nb_steps": self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={"did_abort": did_abort})
    def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
             nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
        """Callback that is called before training begins.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_episodes (integer): Number of episodes to perform.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during testing. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for episode logging
            visualize (boolean): If `True`, the environment is visualized during testing. However,
                this is likely going to slow down testing significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire testing process.
        """
        if not self.compiled:
            raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.training = False
        self.step = 0

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        self._on_test_begin()
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = 0.
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.reset_states()
            observation = deepcopy(env.reset())
            if self.processor is not None:
                observation = self.processor.process_observation(observation)
            assert observation is not None

            # Perform random starts at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.
            nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
            for _ in range(nb_random_start_steps):
                if start_step_policy is None:
                    action = env.action_space.sample()
                else:
                    action = start_step_policy(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r, done, info)
                callbacks.on_action_end(action)
                if done:
                    warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(observation)
                    break

            # Run the episode until we're done.
            done = False

            # Anti-stall heuristic: if the policy emits action 0 more than 15 times
            # in a row, substitute a random action.
            ct0 = 0
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                if action == 0:
                    ct0 += 1
                else:
                    ct0 = 0
                if ct0 > 15:
                    action = env.action_space.sample()
                    ct0 = 0

                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, d, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, d, info = self.processor.process_step(observation, r, d, info)
                    callbacks.on_action_end(action)
                    reward += r
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True
                self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

            # We are in a terminal state but the agent hasn't yet seen it. We therefore
            # perform one more forward-backward call and simply ignore the action before
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            self.forward(observation)
            self.backward(0., terminal=False)

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history
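The `test` loop above applies a small anti-stall heuristic: whenever the greedy policy outputs action 0 more than 15 consecutive times, a random action is substituted. The same idea as a reusable helper; the idle-action index, the threshold, and the Gym-style `action_space.sample()` interface are assumptions.

class AntiStall:
    """Substitute a random action when the same 'idle' action repeats too often."""

    def __init__(self, action_space, idle_action=0, max_repeats=15):
        self.action_space = action_space
        self.idle_action = idle_action
        self.max_repeats = max_repeats
        self._count = 0

    def __call__(self, action):
        # Track consecutive occurrences of the idle action.
        self._count = self._count + 1 if action == self.idle_action else 0
        if self._count > self.max_repeats:
            self._count = 0
            return self.action_space.sample()
        return action

Inside the loop above this would replace the inline `ct0` bookkeeping, e.g. `action = anti_stall(self.forward(observation))`.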
Exemplo n.º 26
0
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=2000000,
            nb_max_episode_steps=None,
            nb_episodes=10000):
        self.training = True
        callbacks = [] if not callbacks else callbacks[:]
        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()
        episode = 0
        self.step = 0
        episode_reward = 0
        episode_step = 0
        did_abort = False
        if load_weight:
            self.load_weights(file_path="")

        if self.training:
            self.epsilon = self.startE
        else:
            self.epsilon = self.evaluateE
        try:
            while self.step < nb_steps:
                callbacks.on_episode_begin(episode)

                # Obtain the initial observation by resetting the environment.
                observation = env.env.getState()
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None
                assert episode_reward is not None
                assert episode_step is not None
                callbacks.on_step_begin(episode_step)
                # This is where all the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation, env)
                reward = 0.
                accumulated_info = {}
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)

                callbacks.on_action_end(action)
                reward += r
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward
                print('reward: ' + str(reward))
                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1
                if done:
                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)
                    episode_step = 0
                    episode_reward = 0
                    episode += 1
                    env.reset()
                    if np.mod(episode, 10) == 0 and self.training:
                        self.save_weights(file_path="", overwrite=True)

        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history
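The `fit` above checkpoints weights every 10 episodes via `save_weights` and picks its epsilon from `startE`/`evaluateE` depending on the training flag. Below is a sketch of the checkpointing idea on its own; the path and interval are assumptions, while the `file_path`/`overwrite` keyword names are taken from the call in the listing.

import os


def maybe_checkpoint(agent, episode, interval=10, path='checkpoints/dqn_weights.h5f'):
    """Save agent weights every `interval` episodes (path and interval are assumed)."""
    if episode > 0 and episode % interval == 0:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        agent.save_weights(file_path=path, overwrite=True)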
Exemplo n.º 27
0
def fit_new(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None,
            arr=None):
    """Trains the agent on the given environment, replaying the fixed action
        sequence `arr` at the start of every episode before the usual random starts.
        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
    if not self.compiled:
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError(
            'action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())

                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None

                # Replay the scripted action sequence `arr` before the random
                # start steps and before the learned policy takes over.
                for ac in arr[:]:
                    if self.processor is not None:
                        ac = self.processor.process_action(ac)
                    callbacks.on_action_begin(ac)
                    observation, reward, done, info = env.step(ac)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(ac)
                    if done:
                        #warnings.warn('Env ended before the deterministic non-neural steps could end.')
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break


                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                    nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
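`fit_new` above replays a fixed action sequence `arr` at the start of every episode, before the random warm-up steps and before the learned policy takes over. A usage sketch follows; the environment id, `make_agent`, and the concrete action indices are placeholders.

import gym

env = gym.make('MontezumaRevengeNoFrameskip-v4')   # placeholder; any discrete-action env works
agent = make_agent(env)                            # hypothetical factory; agent already compiled

# Deterministic prefix executed at every episode start before learning resumes.
scripted_prefix = [2, 2, 3, 1, 0]                  # placeholder action indices

history = fit_new(agent, env,
                  nb_steps=100000,
                  nb_max_start_steps=5,
                  arr=scripted_prefix)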
Exemplo n.º 28
0
    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None,
            episode_averaging_length=10,
            success_threshold=None,
            stopping_patience=None,
            min_nb_steps=500,
            single_cycle=True):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not self.compiled:
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        for cb in callbacks:
            if isinstance(cb, FileLogger):
                save_path = cb.filepath
                folder_index = save_path.index("training_history.json")
                weights_file = os.path.join(save_path[:folder_index],
                                            "dqn_weights.h5f")

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger(interval=log_interval)]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.step = np.int16(0)
        observation = None
        episode_reward = None
        episode_step = None
        episode_num_errors = None
        did_abort = False

        # ------ Early stopping and reporting averages ------------------
        #
        # Ideally this would be done via a callback, but returning control flags
        # from callbacks is awkward, so early stopping is built directly into this
        # fit method. Note that a few values below are hardcoded for now; they
        # could be made configurable later.
        #
        # --------------------------------------------------------------

        if not single_cycle:
            recent_episode_lifetimes = deque([], episode_averaging_length)
            episode_lifetimes_rolling_avg = 0
            best_rolling_avg = 0
            best_episode = 0
            time_since_best = 0
        else:
            recent_episode_wins = deque([], episode_averaging_length)
            best_rolling_avg = 0
            best_episode = 0
            time_since_best = 0
            rolling_win_fraction = 0

        stop_training = False
        has_succeeded = False
        stopped_improving = False

        try:
            while self.step < nb_steps and not stop_training:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    # print("Episode Step:", episode_step)
                    # print("hidden state: ")
                    # print(env.hidden_state)
                    # print("Board State: ")
                    # print(observation)
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step). If the
                # environment exposes a `legal_actions` attribute, the forward pass is
                # restricted to those actions.
                if hasattr(env, "legal_actions"):
                    legal_actions = list(env.legal_actions)
                    action = self.forward(observation, legal_actions)
                else:
                    action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
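                # Repeat the chosen action `action_repetition` times, accumulating the
                # reward and any numeric entries from `info`.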
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
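                # Update the agent from the accumulated reward (backward step).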
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.

                    action = self.forward(observation)
                    self.backward(0., terminal=False)

                    # Update the rolling statistics that drive early stopping.

                    if not single_cycle:
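                        # Rolling average of episode lifetimes over the last
                        # `episode_averaging_length` episodes; training stops once it clears
                        # `success_threshold`, or once it has not improved for
                        # `stopping_patience` episodes after `min_nb_steps` steps.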

                        recent_episode_lifetimes.append(env.lifetime)
                        episode_lifetimes_rolling_avg = np.mean(
                            recent_episode_lifetimes)

                        if episode_lifetimes_rolling_avg > best_rolling_avg:
                            best_rolling_avg = episode_lifetimes_rolling_avg
                            best_episode = episode
                            time_since_best = 0
                        else:
                            time_since_best = episode - best_episode

                        if episode_lifetimes_rolling_avg > success_threshold:
                            stop_training = True
                            has_succeeded = True

                        if self.step > min_nb_steps and time_since_best > stopping_patience:
                            stop_training = True
                            stopped_improving = True

                    else:
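                        # Single-cycle mode: an episode reward of exactly 1 counts as a win.
                        # The win fraction is computed over the full window length, so it is
                        # biased low until `episode_averaging_length` episodes have completed.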

                        if episode_reward == 1:
                            recent_episode_wins.append(1)
                        else:
                            recent_episode_wins.append(0)

                        num_wins = np.sum(recent_episode_wins)
                        rolling_win_fraction = num_wins / episode_averaging_length

                        if rolling_win_fraction > best_rolling_avg:
                            best_rolling_avg = rolling_win_fraction
                            best_episode = episode
                            time_since_best = 0

                            # Save the network whenever the rolling win fraction improves
                            # (after the minimum number of steps). Note that while the agent
                            # is still improving this saves on every improvement, which can
                            # be slow.
                            if self.step > min_nb_steps and weights_file is not None:
                                self.save_weights(weights_file, overwrite=True)

                        else:
                            time_since_best = episode - best_episode

                        if rolling_win_fraction > success_threshold:
                            stop_training = True
                            has_succeeded = True

                        if self.step > min_nb_steps and time_since_best > stopping_patience:
                            stop_training = True
                            stopped_improving = True

                    # This episode is finished, report and reset.

                    if not single_cycle:
                        episode_logs = {
                            'episode_reward': episode_reward,
                            'nb_episode_steps': episode_step,
                            'nb_steps': self.step,
                            'episode_lifetimes_rolling_avg':
                            episode_lifetimes_rolling_avg,
                            'best_rolling_avg': best_rolling_avg,
                            'best_episode': best_episode,
                            'time_since_best': time_since_best,
                            'has_succeeded': has_succeeded,
                            'stopped_improving': stopped_improving
                        }

                    else:
                        episode_logs = {
                            'episode_reward': episode_reward,
                            'nb_episode_steps': episode_step,
                            'nb_steps': self.step,
                            'rolling_win_fraction': rolling_win_fraction,
                            'best_rolling_fraction': best_rolling_avg,
                            'best_episode': best_episode,
                            'time_since_best': time_since_best,
                            'has_succeeded': has_succeeded,
                            'stopped_improving': stopped_improving
                        }

                    callbacks.on_episode_end(episode, episode_logs,
                                             single_cycle)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None

        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True

        # Report the final training statistics to the callbacks.
        end_logs = {
            'did_abort': did_abort,
            'has_succeeded': has_succeeded,
            'stopped_improving': stopped_improving,
            'step': self.step,
        }
        if not single_cycle:
            end_logs['episode_lifetimes_rolling_avg'] = episode_lifetimes_rolling_avg
        else:
            end_logs['rolling_win_fraction'] = rolling_win_fraction
        callbacks.on_train_end(logs=end_logs, single_cycle=single_cycle)

        self._on_train_end()

        return history