def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging.
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Maximum number of steps that the agent performs at the
            beginning of each episode using `start_step_policy`. Notice that this is an upper
            limit since the exact number of steps to be performed is sampled uniformly from
            [0, nb_max_start_steps) at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow if
            `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    if not self.compiled:
        raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at the beginning of the episode and do not record them
                # into the experience. This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
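
# Hedged usage sketch for `fit` (not part of this file): the names and import
# paths follow the classic keras-rl CartPole example and OpenAI Gym, and are
# assumptions if your setup differs (e.g. keras-rl2 imports from tensorflow.keras).
# It shows how `nb_max_start_steps` and `start_step_policy` combine: each episode
# opens with up to 10 forced no-op steps (action 0) that are not recorded.
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

env = gym.make('CartPole-v0')
nb_actions = env.action_space.n

# A tiny Q-network; keras-rl expects the extra window dimension on the input.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

agent = DQNAgent(model=model, nb_actions=nb_actions,
                 memory=SequentialMemory(limit=50000, window_length=1),
                 nb_steps_warmup=10, policy=EpsGreedyQPolicy())
agent.compile(Adam(lr=1e-3), metrics=['mae'])

history = agent.fit(env, nb_steps=10000,
                    nb_max_start_steps=10,
                    start_step_policy=lambda observation: 0,  # deterministic no-op starts
                    log_interval=1000, verbose=1)
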
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Tests the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during testing. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for episode logging.
        visualize (boolean): If `True`, the environment is visualized during testing. However,
            this is likely going to slow down testing significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Maximum number of steps that the agent performs at the
            beginning of each episode using `start_step_policy`. Notice that this is an upper
            limit since the exact number of steps to be performed is sampled uniformly from
            [0, nb_max_start_steps) at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow if
            `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire testing process.
    """
    if not self.compiled:
        raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at the beginning of the episode and do not record them
        # into the experience. This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(observation, r, d, info)
                callbacks.on_action_end(action)
                reward += r
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
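
# Hedged sketch of consuming the `History` that `test` returns; `agent` and `env`
# are assumed to exist as in the `fit` sketch above. The keys mirror the
# `episode_logs` dict built at the end of each test episode.
import numpy as np

test_history = agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=500)
print('mean episode reward:', np.mean(test_history.history['episode_reward']))
print('steps per episode:', test_history.history['nb_steps'])
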
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Tests the agent on the given environment. See `test` above for argument details."""
    if not self.compiled:
        raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at the beginning of the episode and do not record them
        # into the experience. This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(observation, r, d, info)
                callbacks.on_action_end(action)
                reward += r
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging.
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Maximum number of steps that the agent performs at the
            beginning of each episode using `start_step_policy`. Notice that this is an upper
            limit since the exact number of steps to be performed is sampled uniformly from
            [0, nb_max_start_steps) at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow if
            `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    if not self.compiled:
        raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observation = None
    episode_reward = None
    self.episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            self.training = True
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                self.episode_step = np.int16(0)
                episode_reward = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at the beginning of the episode and do not record them
                # into the experience. This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert self.episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(self.episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            # AHA: assumes the env's `info` dict always carries a 'distance' entry.
            last_distance = info['distance']

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(self.episode_step, step_logs)
            self.episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # AHA: Conflict because test sets step=0
                # test_history = self.test(env, nb_episodes=10, nb_max_episode_steps=4)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': self.episode_step,
                    'nb_steps': self.step,
                    'last_train_distance': last_distance,
                    # 'last_test_distance': np.mean(test_history.history['last_distance'])
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                self.episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
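
# The variant above logs `last_train_distance = info['distance']` on every step,
# i.e. it assumes the env's `info` dict always carries a 'distance' entry. A
# hedged sketch of a `Processor` that guarantees this (the 0.0 fallback is an
# assumption, not part of the original code):
from rl.core import Processor

class DistanceProcessor(Processor):
    """Ensure `info` always carries a numeric 'distance' for the episode logs."""

    def process_step(self, observation, reward, done, info):
        info.setdefault('distance', 0.0)  # fall back when the env omits it
        return observation, reward, done, info
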
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=False,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Tests the agent on the given environment. See `test` above for argument details."""
    if not self.compiled:
        raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at the beginning of the episode and do not record them
        # into the experience. This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(observation, r, d, info)
                callbacks.on_action_end(action)
                reward += pd.to_numeric(r)  # assumes `import pandas as pd`; coerces the reward to a numeric type
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    for dqagent in self.dqagents:
        if not dqagent.compiled:
            raise RuntimeError('You tried to fit your agents but one hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.dqagents[0].training = True
    self.dqagents[1].training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    # callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observations = []
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            # Check if observations is empty.
            if observations == []:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward = np.float32([0, 0])

                # Obtain the initial observations by resetting the environment.
                self.dqagents[0].reset_states()
                self.dqagents[1].reset_states()
                observations = deepcopy(env.reset())
                if self.processor is not None:
                    # Process all observations.
                    observations = [self.processor.process_observation(observation)
                                    for observation in observations]
                assert observations != []

                # Perform random starts at the beginning of the episode and do not record them
                # into the experience. This slightly changes the start position between games.
                # (Can remove this bit; not going to use any random starts.)
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        actions = env.action_space.sample()
                    else:
                        actions = start_step_policy(observations)
                    if self.processor is not None:
                        actions = self.processor.process_action(actions)
                    callbacks.on_action_begin(actions)
                    observations, rewards, done, info = env.step(actions)
                    observations = deepcopy(observations)
                    if self.processor is not None:
                        observations, rewards, done, info = self.processor.process_step(observations, rewards, done, info)
                    callbacks.on_action_end(actions)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observations = deepcopy(env.reset())
                        if self.processor is not None:
                            observations = [self.processor.process_observation(observation)
                                            for observation in observations]
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observations != []

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the actions
            # (forward step) and then use the rewards to improve (backward step).
            # Indices [0, 3] were meant to be hider indices and [4, 5] seeker indices; note
            # that the loops below actually feed observations[2:6] to dqagents[0] and
            # observations[0:2] to dqagents[1].
            actions = []
            for i in range(2, 6):
                actions.append(self.dqagents[0].forward(observations[i]))
            for i in range(0, 2):
                actions.append(self.dqagents[1].forward(observations[i]))
            # Process all actions.
            if self.processor is not None:
                actions = [self.processor.process_action(action) for action in actions]
            rewards = np.float32([0, 0])
            hider_reward = np.float32(0)
            seeker_reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(actions)
                # Expect rs[0] to be the aggregate hider reward, rs[1] the aggregate seeker reward.
                observations, rs, done, info = env.step(actions)
                observations = deepcopy(observations)
                if self.processor is not None:
                    observations, rs, done, info = self.processor.process_step(observations, rs, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(actions)
                hider_reward += rs[0]
                seeker_reward += rs[1]
                rewards += rs
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            # Run the backward step w.r.t. each agent's respective aggregate reward.
            hider_metrics = self.dqagents[0].backward(hider_reward, terminal=done)
            seeker_metrics = self.dqagents[1].backward(seeker_reward, terminal=done)
            episode_reward += rewards

            step_logs = {
                'actions': actions,
                'observations': observations,
                'hider_reward': hider_reward,
                'hider_metrics': hider_metrics,
                'seeker_reward': seeker_reward,
                'metrics': seeker_metrics,
                'reward': seeker_reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1
            self.dqagents[0].step += 1
            self.dqagents[1].step += 1

            if done:
                # We are in a terminal state but the agents haven't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the actions before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                for i in range(2, 6):
                    self.dqagents[0].forward(observations[i])
                for i in range(0, 2):
                    self.dqagents[1].forward(observations[i])
                self.dqagents[0].backward(0., terminal=False)
                self.dqagents[1].backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                    'epoch': 1
                }
                # callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observations = []
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    # callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
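
# Hedged sketch isolating the observation split hard-coded in the loops above:
# observations[2:6] are routed to dqagents[0] and observations[0:2] to
# dqagents[1]. The helper name is hypothetical; it only restates the code so the
# grouping can be unit-tested on its own.
def split_forward(dqagents, observations):
    """Return one action per observation, grouped per agent as in `fit` above."""
    actions = [dqagents[0].forward(obs) for obs in observations[2:6]]
    actions += [dqagents[1].forward(obs) for obs in observations[0:2]]
    return actions
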
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None, stepper=False):
    if not self.compiled:
        raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True
    self.stepper = stepper

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            penalty = 0
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at the beginning of the episode and do not record them
                # into the experience. This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if self.manual:
                        action = int(input("action?\n"))
                    elif start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.shield is not None:
                        if self.maze:
                            inp = get_input_maze(observation)
                        else:
                            inp = get_input(observation)
                        action_bin = to_bin(action)
                        if not self.huge_neg:
                            action = self.shield(inp[0], inp[1], inp[2], action_bin[0], action_bin[1], action_bin[2])
                        elif self.huge_neg:
                            if to_int(self.shield(inp[0], inp[1], inp[2], action_bin[0], action_bin[1], action_bin[2])) != action:
                                penalty = -10
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    if self.stepper:
                        action = int(input("action?\n"))
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            # print(observation)
            if self.manual:
                oldaction = self.forward(observation, manual=True)
            elif self.preemptive:
                banned_actions = []
                inp = get_input(observation)
                for an_action in range(0, 8):
                    an_action_bin = to_bin(an_action)
                    action = to_int(self.shield.move(inp[0], inp[1], inp[2], inp[3], an_action_bin[0], an_action_bin[1], an_action_bin[2]))
                    if action != an_action:
                        banned_actions.append(an_action)
                oldaction = self.forward(observation, manual=False, banned_actions=banned_actions)
            else:
                oldaction = self.forward(observation, manual=False)
            # print(oldaction)
            if self.shield is not None:
                if self.maze:
                    inp = get_input_maze(observation)
                else:
                    inp = get_input(observation)
                action_bin = to_bin(oldaction)
                # sleep(0.01)
                if self.preemptive:
                    action = oldaction
                elif not self.huge_neg:
                    action = to_int(self.shield.move(inp[0], inp[1], inp[2], inp[3], action_bin[0], action_bin[1], action_bin[2]))
                elif self.huge_neg:
                    # Compare the shield's choice against the proposed action (`oldaction`).
                    if to_int(self.shield(inp[0], inp[1], inp[2], action_bin[0], action_bin[1], action_bin[2])) != oldaction:
                        penalty = -10
                    action = oldaction
            else:
                action = oldaction
            # print(action, oldaction)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r + penalty, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r + penalty
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            oldstep_logs = {
                'action': oldaction,
                'observation': observation,
                'reward': -1,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            # if correction:
            #     callbacks.on_step_end(episode_step, oldstep_logs)
            #     episode_step += 1
            #     self.step += 1
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
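
# `to_bin`/`to_int` are imported from elsewhere in this project. Since actions
# range over [0, 8) above and the shield takes three action bits, they plausibly
# convert between an action index and its 3-bit encoding. A hedged sketch of
# compatible helpers (an assumption about the real implementations):
def to_bin(action, width=3):
    """Encode an action index as a list of bits, most significant bit first."""
    return [(action >> i) & 1 for i in reversed(range(width))]

def to_int(bits):
    """Decode a bit sequence (most significant bit first) back to an integer."""
    result = 0
    for bit in bits:
        result = (result << 1) | int(bit)
    return result

assert to_int(to_bin(5)) == 5  # round-trips for every action in [0, 8)
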
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None, episode_averaging_length=10, success_threshold=None,
        stopping_patience=None, min_nb_steps=500, single_cycle=True):
    """Trains the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging.
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Maximum number of steps that the agent performs at the
            beginning of each episode using `start_step_policy`. Notice that this is an upper
            limit since the exact number of steps to be performed is sampled uniformly from
            [0, nb_max_start_steps) at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow if
            `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.
        episode_averaging_length (integer): Window size, in episodes, of the rolling average
            used for early stopping.
        success_threshold (float): Training stops with `has_succeeded=True` once the rolling
            average (episode lifetime, or win fraction if `single_cycle`) exceeds this value.
        stopping_patience (integer): Number of episodes without improvement of the rolling
            average before training stops with `stopped_improving=True`.
        min_nb_steps (integer): Minimum number of steps before early stopping (and weight
            saving) can trigger.
        single_cycle (boolean): If `True`, track the fraction of recent episodes that were won
            (`episode_reward == 1`); otherwise track the rolling average episode lifetime.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    if not self.compiled:
        raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    # Derive the weights file from a `FileLogger`, if one was passed in. `weights_file`
    # stays `None` (and weight saving is skipped) without such a callback.
    weights_file = None
    for cb in callbacks:
        if isinstance(cb, FileLogger):
            save_path = cb.filepath
            folder_index = save_path.index("training_history.json")
            weights_file = os.path.join(save_path[:folder_index], "dqn_weights.h5f")

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger(interval=log_interval)]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observation = None
    episode_reward = None
    episode_step = None
    episode_num_errors = None
    did_abort = False

    # ------ Early stopping and reporting averages ------------------
    # It would be ideal to do this via a callback, but returning flags from callbacks
    # seems tricky. Eish! So, we automatically include early stopping here in the fit
    # method. NB: We have hardcoded in something which is probably not ideal to hard
    # code, but I just want it to work, and can fix things and make them nicer/more
    # flexible at a later stage!
    # ----------------------------------------------------------------
    if not single_cycle:
        recent_episode_lifetimes = deque([], episode_averaging_length)
        episode_lifetimes_rolling_avg = 0
        best_rolling_avg = 0
        best_episode = 0
        time_since_best = 0
    elif single_cycle:
        recent_episode_wins = deque([], episode_averaging_length)
        best_rolling_avg = 0
        best_episode = 0
        time_since_best = 0
        rolling_win_fraction = 0

    stop_training = False
    has_succeeded = False
    stopped_improving = False

    try:
        while self.step < nb_steps and not stop_training:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                # print("Episode Step:", episode_step)
                # print("hidden state: ")
                # print(env.hidden_state)
                # print("Board State: ")
                # print(observation)
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at the beginning of the episode and do not record them
                # into the experience. This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # print("Episode Step:", episode_step)

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            if hasattr(env, "legal_actions"):
                legal_actions = list(env.legal_actions)
                action = self.forward(observation, legal_actions)
                # print("legal actions: ", legal_actions)
                # print("chosen action: ", action)
            else:
                action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward
            # print("new hidden state: ")
            # print(env.hidden_state)
            # print("new board state: ")
            # print(observation)
            # print("reward: ", r, "episode reward: ", episode_reward)
            # print("done: ", done)

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                action = self.forward(observation)
                self.backward(0., terminal=False)

                # Now we work out the recent averages; these feed into early stopping.
                if not single_cycle:
                    recent_episode_lifetimes.append(env.lifetime)
                    episode_lifetimes_rolling_avg = np.mean(recent_episode_lifetimes)
                    if episode_lifetimes_rolling_avg > best_rolling_avg:
                        best_rolling_avg = episode_lifetimes_rolling_avg
                        best_episode = episode
                        time_since_best = 0
                    else:
                        time_since_best = episode - best_episode
                    if episode_lifetimes_rolling_avg > success_threshold:
                        stop_training = True
                        has_succeeded = True
                    if self.step > min_nb_steps and time_since_best > stopping_patience:
                        stop_training = True
                        stopped_improving = True
                else:
                    if episode_reward == 1:
                        recent_episode_wins.append(1)
                    else:
                        recent_episode_wins.append(0)
                    num_wins = np.sum(recent_episode_wins)
                    rolling_win_fraction = num_wins / episode_averaging_length
                    if rolling_win_fraction > best_rolling_avg:
                        best_rolling_avg = rolling_win_fraction
                        best_episode = episode
                        time_since_best = 0
                        # Here I need to add something to save the net - I'm worried this will
                        # make things really slow while it's improving, because it will be
                        # saving every time for a long time. Eish!
                        if self.step > min_nb_steps and weights_file is not None:
                            self.save_weights(weights_file, overwrite=True)
                    else:
                        time_since_best = episode - best_episode
                    if rolling_win_fraction > success_threshold:
                        stop_training = True
                        has_succeeded = True
                    if self.step > min_nb_steps and time_since_best > stopping_patience:
                        stop_training = True
                        stopped_improving = True

                # This episode is finished, report and reset.
                if not single_cycle:
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                        'episode_lifetimes_rolling_avg': episode_lifetimes_rolling_avg,
                        'best_rolling_avg': best_rolling_avg,
                        'best_episode': best_episode,
                        'time_since_best': time_since_best,
                        'has_succeeded': has_succeeded,
                        'stopped_improving': stopped_improving
                    }
                else:
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                        'rolling_win_fraction': rolling_win_fraction,
                        'best_rolling_fraction': best_rolling_avg,
                        'best_episode': best_episode,
                        'time_since_best': time_since_best,
                        'has_succeeded': has_succeeded,
                        'stopped_improving': stopped_improving
                    }
                callbacks.on_episode_end(episode, episode_logs, single_cycle)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    if not single_cycle:
        callbacks.on_train_end(logs={
            'did_abort': did_abort,
            'has_succeeded': has_succeeded,
            'stopped_improving': stopped_improving,
            'episode_lifetimes_rolling_avg': episode_lifetimes_rolling_avg,
            'step': self.step
        }, single_cycle=single_cycle)
    else:
        callbacks.on_train_end(logs={
            'did_abort': did_abort,
            'has_succeeded': has_succeeded,
            'stopped_improving': stopped_improving,
            'rolling_win_fraction': rolling_win_fraction,
            'step': self.step
        }, single_cycle=single_cycle)
    self._on_train_end()

    return history
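
# Hedged usage sketch for the early-stopping variant above (`agent`/`env` assumed
# to exist, and the 'results/...' path is hypothetical). With `single_cycle=True`,
# training stops once more than 90% of the last 20 episodes were wins, or once no
# improvement has been seen for 50 episodes after the first 500 steps; the
# FileLogger also supplies the directory used for `dqn_weights.h5f`.
from rl.callbacks import FileLogger

history = agent.fit(env, nb_steps=100000,
                    callbacks=[FileLogger('results/training_history.json')],
                    episode_averaging_length=20,
                    success_threshold=0.9,
                    stopping_patience=50,
                    min_nb_steps=500,
                    single_cycle=True)
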