class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 # self.exploration_theta = 0.085 # self.exploration_sigma = 0.15 self.exploration_theta = 0.070 self.exploration_sigma = 0.20 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.70 # discount factor self.tau = 0.01 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
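# The OUNoise process used above is defined elsewhere in the project. For
# reference, here is a minimal sketch of a standard Ornstein-Uhlenbeck process
# matching the constructor call OUNoise(size, mu, theta, sigma) used in
# __init__ (an assumed implementation, not necessarily the project's exact one):
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta  # pull strength toward the mean
        self.sigma = sigma  # scale of the random perturbations
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process: dx = theta * (mu - x) + sigma * N(0, 1)."""
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.randn(len(self.state)))
        self.state = self.state + dx
        return self.state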
class DQLagent(): """Reinforcement Learning agent that learns using a DQL network.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size # Exploration parameters self.decay_max = 1.0 # exploration probability at start self.decay_min = 0.01 # minimum exploration probability self.decay_rate = 0.0001 # exponential decay rate for exploration prob self.decay_step = np.exp(-self.decay_rate) self.decay_range = self.decay_max - self.decay_min self.decay_factor = 1. self.explore_p = self.decay_max # Network parameters self.learning_rate = 0.0001 # Q-network learning rate #self.learning_rate = 0.001 # Q-network learning rate # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor # Score tracker and learning parameters self.best_score = -np.inf self.score = -np.inf self.loss = 0 self.qnet = QNetwork(self.state_size, self.action_size, name='main', learning_rate=self.learning_rate) # Episode variables self.reset_episode() def reset_episode(self,new_tgt_pos=None): self.total_reward = 0.0 self.count = 0 state = self.task.reset(new_tgt_pos) self.last_state = state return state def act(self, tfsess, state): """Returns actions for given state(s) as per current policy.""" # Explore or Exploit if len(self.memory) > self.batch_size: # epsilon-greedy policy: self.decay_factor *= self.decay_step self.explore_p = self.decay_min + (self.decay_range*self.decay_factor) if self.explore_p > np.random.rand(): # Make a random action actions = np.random.randint(0,self.action_size) else: # Get actions from Q-network feed = {self.qnet.inputs_: state.reshape((1, *state.shape))} Qs = tfsess.run(self.qnet.output, feed_dict=feed) actions = np.argmax(Qs) else: # pick actions equi-probablistically actions = np.random.randint(0,self.action_size) return actions def step(self, tfsess, action, # int reward, # np.ndarray (action_repeat,) next_state, # np.ndarray (state_size*action_repeat,) done): # bool # Save experience / reward self.memory.add(self.last_state, action, np.mean(reward), next_state, done) # Save experience / reward self.total_reward += np.mean(reward) self.count += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(tfsess,experiences) # Roll over last state and action self.last_state = next_state def learn(self, tfsess, expbatch): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.array([e.state for e in expbatch if e is not None]) actions = np.array([e.action for e in expbatch if e is not None]).astype(np.float32) rewards = np.array([e.reward for e in expbatch if e is not None]).astype(np.float32) dones = np.array([e.done for e in expbatch if e is not None]).astype(np.uint8) next_states = np.array([e.next_state for e in expbatch if e is not None]) # Train network target_Qs = tfsess.run(self.qnet.output, feed_dict={self.qnet.inputs_: next_states}) # Set target_Qs to 0 for states where episode ends target_Qs[dones] = np.zeros(self.action_size) targets = rewards + self.gamma * np.max(target_Qs, axis=1) self.loss, _ = tfsess.run([self.qnet.loss, self.qnet.opt], feed_dict={self.qnet.inputs_: states, self.qnet.targetQs_: targets, self.qnet.actions_: actions}) self.score = self.total_reward / float(self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score
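# Both agents above construct ReplayBuffer(buffer_size, batch_size) and rely on
# add/sample/__len__ with namedtuple-style experiences (e.state, e.action, ...).
# A minimal sketch consistent with that interface (an assumption, not the
# project's exact code; the cfg-driven Agent further below uses a different,
# dict-based buffer):
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer that stores and uniformly samples experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences roll off
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        """Uniformly sample a batch of experiences without replacement."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)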
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" #name: is a name to use to save the netural Network models #load: load data from existing models or cretae an entirly new model def __init__(self, task, name, loadfile=False): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) self.name = name if loadfile: self.actor_local.model.load_weights("./weights/" + name + "_actor.h5") self.critic_local.model.load_weights("./weights/" + name + "_critic.h5") # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 #0.3 #original 0.15 self.exploration_sigma = 0.3 #0.3 #original 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) #rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1)) #TESTING to scale rewards to a small number. 
dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def save_weights(self): self.actor_local.model.save_weights("./weights/" + self.name + "_actor.h5") self.critic_local.model.save_weights("./weights/" + self.name + "_critic.h5") #Notice that after training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model. #However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau. def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
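# A hypothetical training loop for the DDPG agent above, assuming `task` is the
# project's task/environment instance and that task.step(action) returns
# (next_state, reward, done); num_episodes is a placeholder budget:
num_episodes = 1000  # placeholder
agent = DDPG(task, name="ddpg_run1", loadfile=False)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)  # assumed task API
        agent.step(action, reward, next_state, done)
        state = next_state
    agent.save_weights()  # persist actor/critic weights after each episode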
class Agent():
    def __init__(self, cfg):
        # Environment configuration
        self.action_shape = cfg['env']['action_shape']

        # Replay memory
        cfg['agent']['memory']['action_shape'] = self.action_shape
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Algorithm parameters
        self.exploration_mu, self.exploration_sigma = cfg['agent']['noise']
        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]

        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape,
                           cfg['env']['action_range'], self.tau,
                           self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape,
                             self.tau, cfg['critic'])

        # Flags & counters
        self.training = True
        self.episode = 0
        self.max_episode_explore = cfg['agent']['explore']

    def init_actor_critic(self):
        # Initialize target models with local model parameters
        self.critic.copy_local_in_target()
        self.actor.copy_local_in_target()

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done,
                        self.training)

    def act(self, state):
        self.last_state = state
        window_states = state.reshape(1, -1)
        action = self.actor.predict(window_states)

        if self.training and self.episode < self.max_episode_explore:
            # Anneal exploration: blend the policy's action with Gaussian noise,
            # shifting weight from noise to policy as episodes progress
            p = self.episode / self.max_episode_explore
            action = p * action + (1 - p) * np.random.normal(
                self.exploration_mu, self.exploration_sigma)

        return np.clip(action.ravel(), a_max=900, a_min=0)

    def learn(self):
        if self.memory.is_sufficient():
            experiences = self.memory.sample()

            states = experiences['state'][:, 0].reshape(self.memory.batch_size, -1)
            actions = experiences['action'][:, 0].reshape(self.memory.batch_size, -1)
            rewards = experiences['reward']
            dones = experiences['done']
            next_states = experiences['next_state'][:, 0].reshape(
                self.memory.batch_size, -1)

            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor.get_targets(next_states)
            Q_targets_next = self.critic.get_targets(next_states, actions_next)

            # Compute Q targets for current states and train critic model
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            self.critic.fit(states, actions, Q_targets)

            # Train actor model
            action_gradients = self.critic.get_actions_grad(states, actions)[0]
            self.actor.fit(states, action_gradients)

            # Soft-update target models
            self.critic.soft_update()
            self.actor.soft_update()
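# A hypothetical cfg dict covering every key read in Agent.__init__ above; the
# real schema lives in the project's configuration, so all values here are
# illustrative placeholders:
cfg = {
    'env': {
        'action_shape': (4,),      # e.g. four rotor speeds
        'action_range': (0, 900),  # consistent with the np.clip bounds in act()
    },
    'agent': {
        'memory': {'buffer_size': 100000, 'batch_size': 64},  # ReplayBuffer kwargs
        'noise': (0.0, 0.2),       # (exploration_mu, exploration_sigma)
        'gamma': 0.99,
        'tau': 0.001,
        'explore': 500,            # episodes over which noise is annealed
    },
    'actor': {},                   # Actor hyperparameters (project-specific)
    'critic': {},                  # Critic hyperparameters (project-specific)
}

agent = Agent(cfg)
agent.init_actor_critic()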