Example No. 1
class CoopActionOtherDDPG(Agent):  # Two agents that can each observe the other's action (based on the keras-rl Agent implementation)

    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopActionOtherDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.
        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent, but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        assert self.processor is None  # Removed processors here for simplification. Not needed anyway
        assert nb_max_start_steps == 0  # Removed here for simplification. Not needed anyway
        assert action_repetition == 1  # Removed here for simplification. Not needed anyway

        self.agent1.training = True
        self.agent2.training = True

        experience_for_plotting = deque()

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation1 = observation2 = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation1 is None or observation2 is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    obs = env.reset()
                    observation1 = deepcopy(obs) + (0.,)
                    observation2 = deepcopy(obs) + (0.,)

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation1 is not None
                assert observation2 is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation1).item()
                action2 = self.agent2.forward(observation2).item()
                action = (action1, action2)
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                accumulated_info = {}
                done = False

                callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                obs, r, done, info = env.step(action)
                if done:
                    raise AttributeError  # The episode was reset unexpectedly
                    # (see https://stackoverflow.com/questions/42787924/)

                observation1 = deepcopy(obs) + (info["u2_clipped"],)  # Add action other to the observation
                observation2 = deepcopy(obs) + (info["u1_clipped"],)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward1 += info["r1"]
                reward2 += info["r2"]

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action[0] + action[1],
                    'observation': observation1,
                    'reward': reward1 + reward2,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if len(obs) == 2:
                    experience_for_plotting.append((info["t"], obs, (info["u1_clipped"], info["u2_clipped"]), (0., 0.),
                                                    r, (info["r1"], info["r2"])))

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation1)
                    self.agent2.forward(observation2)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation1 = None
                    observation2 = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()

        return experience_for_plotting
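
The sketch below is not part of the original example; it shows one plausible way to wire up the class above in keras-rl style. It assumes a hypothetical two-agent environment `TwoAgentEnv` whose observations are tuples, whose `step()` accepts a tuple of two scalar actions, and whose `info` dict carries the keys "r1", "r2", "u1_clipped", "u2_clipped" and "t" that the `fit()` loop reads; network sizes and hyperparameters are illustrative only.

# Hypothetical usage sketch for CoopActionOtherDDPG (not from the original repo).
from keras.layers import Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


def build_actor(obs_dim, nb_actions):
    # Small MLP actor; the observation is assumed to be augmented with the other agent's last action.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, obs_dim)))
    actor.add(Dense(32, activation='relu'))
    actor.add(Dense(nb_actions, activation='tanh'))
    return actor


def build_critic(obs_dim, nb_actions):
    # Q-network that takes both the action and the observation as inputs.
    action_input = Input(shape=(nb_actions,))
    observation_input = Input(shape=(1, obs_dim))
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(32, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    return Model(inputs=[action_input, observation_input], outputs=x), action_input


nb_actions = 1
obs_dim = 3  # assumed: base observation of size 2, plus 1 for the other agent's action

actor1, actor2 = build_actor(obs_dim, nb_actions), build_actor(obs_dim, nb_actions)
critic1, critic_action_input1 = build_critic(obs_dim, nb_actions)
critic2, critic_action_input2 = build_critic(obs_dim, nb_actions)

agent = CoopActionOtherDDPG(
    nb_actions, actor1, actor2, critic1, critic2,
    critic_action_input1, critic_action_input2,
    memory1=SequentialMemory(limit=100000, window_length=1),
    memory2=SequentialMemory(limit=100000, window_length=1),
    random_process1=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3),
    random_process2=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3))
agent.compile(Adam(lr=1e-3), metrics=['mae'])

env = TwoAgentEnv()  # hypothetical two-agent environment, see the assumptions above
experience = agent.fit(env, nb_steps=50000, nb_max_episode_steps=200)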
Example No. 2
class CoopDDPG(Agent):  # Two agents that cannot observe each other's actions (based on the keras-rl Agent implementation)

    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):

        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent, but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.agent1.training = True
        self.agent2.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    observation = deepcopy(env.reset())
                    if self.agent1.processor is not None:  # not individual for now
                        observation = self.agent1.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.agent1.processor is not None:  # not individual for now. action is not from agent anyway
                            action = self.agent1.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.agent1.processor is not None:
                            observation, reward, done, info = self.agent1.processor.process_step(observation, reward,
                                                                                                 done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. '
                                'You should probably lower the `nb_max_start_steps` parameter.'.format(
                                    nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.agent1.processor is not None:
                                observation = self.agent1.processor.process_observation(observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation)
                action2 = self.agent2.forward(observation)
                if self.agent1.processor is not None:
                    action1 = self.agent1.processor.process_action(action1)
                if self.agent2.processor is not None:
                    action2 = self.agent2.processor.process_action(action2)
                action = (action1.item(), action2.item())
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.agent1.processor is not None:
                        observation, r, done, info = self.agent1.processor.process_step(observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward1 += info["r1"]
                    reward2 += info["r2"]
                    reward += info["r1"] + info["r2"]
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation)
                    self.agent2.forward(observation)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()


        return history
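
Construction is analogous for the class above, except that neither actor observes the other agent's action, so the networks take the raw environment observation. A minimal sketch under the same assumptions as in Example No. 1 (hypothetical `TwoAgentEnv`, illustrative layer sizes and hyperparameters); note that this variant's `fit()` returns a Keras `History` object.

# Hypothetical usage sketch for CoopDDPG (not from the original repo).
from keras.layers import Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

nb_actions, obs_dim = 1, 2  # raw observation only; nothing is appended to it


def make_pair():
    # Build one actor/critic pair on the shared (non-augmented) observation.
    actor = Sequential([Flatten(input_shape=(1, obs_dim)),
                        Dense(32, activation='relu'),
                        Dense(nb_actions, activation='tanh')])
    action_input = Input(shape=(nb_actions,))
    observation_input = Input(shape=(1, obs_dim))
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(32, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    return actor, critic, action_input


actor1, critic1, critic_action_input1 = make_pair()
actor2, critic2, critic_action_input2 = make_pair()

agent = CoopDDPG(
    nb_actions, actor1, actor2, critic1, critic2,
    critic_action_input1, critic_action_input2,
    memory1=SequentialMemory(limit=100000, window_length=1),
    memory2=SequentialMemory(limit=100000, window_length=1),
    random_process1=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3),
    random_process2=OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3))
agent.compile(Adam(lr=1e-3), metrics=['mae'])

history = agent.fit(TwoAgentEnv(), nb_steps=50000, nb_max_episode_steps=200)  # hypothetical env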