import random as rn
from collections import deque, namedtuple

import numpy as np
import keras
import tensorflow as tf
from keras.models import load_model
from sklearn.metrics import mean_squared_error, r2_score

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to be defined in
# companion modules of this project (their constructor signatures vary
# between the agent variants below).


class Agent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters (was 0.01)

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.count += 1
        self.score = self.total_reward  # / float(self.count) if self.count else 0.0

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
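
# A minimal sketch of how Agent is typically driven. The `task` object is
# assumed to expose state_size/action_size/action_low/action_high plus
# reset() and step(action) -> (next_state, reward, done); run_training and
# num_episodes are illustrative names, not part of the original project.
def run_training(task, num_episodes=500):
    agent = Agent(task)
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # also resets the OU noise process
        done = False
        while not done:
            action = agent.act(state)  # noisy action from the local actor
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)  # store + learn
            state = next_state
        print("Episode {:4d}, score: {:7.3f}".format(i_episode, agent.score))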
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, state_size, action_size, train=True):
        self.train = train
        self.action_size = action_size
        self.state_size = state_size

        actor_lr = 0.001   # Learning rate for Actor (was 0.0001)
        critic_lr = 0.01   # Learning rate for Critic (was 0.001)
        deep_lr = 1e-3

        # Noise process
        self.exploration_mu = 0        # Mean
        self.exploration_theta = 0.6   # (was 0.15) How fast the variable reverts to the mean
        self.exploration_sigma = 0.3   # (was 0.2) Degree of volatility
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        if self.train:
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size, actor_lr)
            self.actor_target = Actor(self.state_size, self.action_size, actor_lr)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size, critic_lr)
            self.critic_target = Critic(self.state_size, self.action_size, critic_lr)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Replay memory
        self.buffer_size = 300  # was 1024
        self.batch_size = 32

        # internal memory (deque)
        self.memory = deque(maxlen=self.buffer_size)
        # self.memory = []
        self.experience = namedtuple(
            "Data",
            field_names=["state", "action", "reward", "next_state", "done"])

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters (was 0.001)
        self.guide = False

        if self.train:  # the local models are only built in train mode
            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            # print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

        self.batch_id = 0
        self.critic_loss = 0
        self.actor_loss = 0
        self.C_loss = []
        self.A_loss = []

    def save_model(self, num):
        # Load the best checkpoint weights (e.g. weights-improvement--0.03.hdf5),
        # then export the full model. NOTE: self.deep_NN is never initialized in
        # this class; it is assumed to be attached elsewhere.
        load_str = "weights-improvement--0.{}.hdf5".format(num)
        self.deep_NN.model.load_weights(load_str)
        self.deep_NN.model.save("./model/model.h5")
        print("Saved model with best weights to disk")

    def load_model(self, name):
        # Load the weights
        self.deep_NN.model.load_weights(name)

    def summarize_prediction(self, Y_true, Y_pred):
        mse = mean_squared_error(Y_true, Y_pred)
        r_squared = r2_score(Y_true, Y_pred)
        print("mse = {0:.2f}".format(mse))
        print("r_squared = {0:.2f}".format(r_squared))

    def predict_and_summarize(self, X, Y):
        model = load_model("./model/model.h5")
        Y_pred = model.predict(X).astype('int')
        self.summarize_prediction(Y, Y_pred)
        return Y_pred

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        # state = np.reshape(state, [-1, self.state_size])
        # action = self.trained.model.predict(state)[0]
        noise = self.noise.sample()
        # NOTE: actor_target only exists when train=True
        action = self.actor_target.model.predict(state)
        return action, noise

    def get_sample(self, b_size=None):
        if b_size is None:
            b_size = self.batch_size
        return rn.sample(self.memory, k=b_size)

    def conv_to_tensor(self, img):
        # Black-and-white image, e.g. (244, 244) -> (1, 244, 244, 1)
        if len(img.shape) == 2:
            img = np.expand_dims(img, axis=2)  # was axis=3, out of bounds for a 2-D array
            img = np.expand_dims(img, axis=0)
        # RGB or stacked image, e.g. (244, 244, 3) -> (1, 244, 244, 3)
        elif len(img.shape) == 3:
            img = np.expand_dims(img, axis=0)
        return img

    def reset(self):
        self.critic_loss = 0
        self.memory.clear()

    def step(self, state, action, reward, next_state, done):
        d = self.experience(state, action, reward, next_state, done)
        if len(self.memory) == self.buffer_size:
            self.memory.popleft()  # redundant: deque(maxlen=...) already evicts the oldest entry
        self.memory.append(d)

    def learn(self, verbose=False):
        """Update policy and value parameters using given batch of experience tuples."""
        if len(self.memory) < self.batch_size:
            return
        experiences = self.get_sample()
        if verbose:
            print("Buffer Size: ", len(self.memory))
            print("Sample Size: ", len(experiences))

        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([self.conv_to_tensor(e.state) for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([self.conv_to_tensor(e.next_state) for e in experiences if e is not None])

        if False:  # shape debugging
            print("States", states.shape)
            print("Actions", actions.shape)
            print("Rewards", rewards.shape)
            print("Next States", next_states.shape)
            print("Dones", dones.shape)

        # Keep training actor_local and critic_local. Use values from the
        # target models to update and train the local ones; don't train the
        # target models directly, we soft-update them instead.
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        # print("Actions next", actions_next.shape)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_loss = self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_loss = self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.A_loss.append(self.actor_loss)
        self.C_loss.append(self.critic_loss)
        # self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
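
# Every agent in this file constructs an OUNoise but the class itself lives
# elsewhere. Below is a minimal sketch of an Ornstein-Uhlenbeck process
# matching the call patterns used here (the positional four-argument form
# above and the OUNoise(action_size=...) keyword form in DDPGAgent below);
# the body is a standard OU implementation, not the project's actual code.
import copy

class OUNoise:
    """Ornstein-Uhlenbeck process: mean-reverting noise for exploration."""

    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_size)
        self.theta = theta  # speed of mean reversion
        self.sigma = sigma  # scale of the random kicks
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state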
class DDPGAgent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, state_size, action_size, action_low, action_high):
        # self.task = task
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # Learning rates
        self.lr_actor = 1e-4
        self.lr_critic = 1e-3

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.lr_actor)
        self.actor_target = Actor(self.state_size, self.action_size, self.lr_actor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.lr_critic)
        self.critic_target = Critic(self.state_size, self.action_size, self.lr_critic)

        # Store model architecture of actor and critic locally
        # keras.utils.plot_model(self.actor_local.model, '/home/danie/catkin_ws/src/ddpg/src/actor.png', show_shapes=True)
        # keras.utils.plot_model(self.critic_local.model, '/home/danie/catkin_ws/src/ddpg/src/critic.png', show_shapes=True)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Initialize OU noise
        self.noise = OUNoise(action_size=self.action_size)

        # Currently testing with Gaussian noise instead of OU; parameters for Gaussian follow
        self.noise_mean = 0.0
        self.noise_stddev = 0.2

        # Initialize replay buffer
        self.buffer_size = int(1e6)  # was the float 1e6, which breaks deque(maxlen=...)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for DDPG
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()

    def choose_action(self, state):
        """Returns actions for given state(s) as per current policy."""
        pure_action = self.actor_local.model.predict(state)[0]

        # Add Gaussian noise for exploration
        # noise = np.random.normal(self.noise_mean, self.noise_stddev, self.action_size)

        # Add OU noise for exploration
        noise = self.noise.sample()
        # print("pure", pure_action)
        # print("noise", noise)
        # action = self.action_high * (pure_action + noise)
        # action = pure_action + noise
        action = np.clip(pure_action + noise, self.action_low, self.action_high)
        # print("action", action)
        return action.tolist()

    def store_transition(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def train_actor_and_critic(self):
        """Update policy and value parameters using given batch of experience tuples."""
        # If there are not enough transitions in memory, don't train!
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample()  # sample a batch from memory

        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in transitions if e is not None])
        actions = np.array([e.action for e in transitions if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in transitions if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in transitions if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in transitions if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)  # mu' in the algorithm
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])  # Q' in the algorithm

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)  # y_i in the algorithm
        critic_loss = self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        # print("action_gradients", action_gradients)
        # Custom training function
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        # self.soft_update(self.critic_local.model, self.critic_target.model, self.tau)
        # self.soft_update(self.actor_local.model, self.actor_target.model, self.tau)
        self.soft_update_critic()
        self.soft_update_actor()

        return critic_loss

    def soft_update_actor(self):
        """Soft update model parameters."""
        local_weights = np.array(self.actor_local.model.get_weights())
        target_weights = np.array(self.actor_target.model.get_weights())
        assert len(local_weights) == len(target_weights), \
            'Local and target model parameters must have the same size'
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        self.actor_target.model.set_weights(new_weights)

    def soft_update_critic(self):
        """Soft update model parameters."""
        local_weights = np.array(self.critic_local.model.get_weights())
        target_weights = np.array(self.critic_target.model.get_weights())
        assert len(local_weights) == len(target_weights), \
            'Local and target model parameters must have the same size'
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        self.critic_target.model.set_weights(new_weights)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            'Local and target model parameters must have the same size'
        new_weights = tau * local_weights + (1 - tau) * target_weights
        target_model.set_weights(new_weights)
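
# ReplayBuffer is another assumed companion class, used by Agent, DDPGAgent,
# and the task-based DDPG variants. A minimal sketch matching the calls made
# in this file -- add(...), sample(), sample(batch_size=...), and len(...) --
# follows; the field names mirror the namedtuple attributes that the learn()
# methods read (e.state, e.action, e.reward, e.next_state, e.done).
class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Append a new experience, evicting the oldest one when full."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a uniformly random batch of experiences."""
        if batch_size is None:
            batch_size = self.batch_size
        return rn.sample(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)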
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 1e-5   # was 0.0001
        self.critic_lr = 1e-4  # was 0.0000001
        self.network = [128, 256, 128]
        self.train = train

        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if self.train:
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size, critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size, critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0        # Mean
            self.exploration_theta = 0.15  # How fast the variable reverts to the mean
            self.exploration_sigma = 0.2   # Degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta, self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01    # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback, which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)
            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")
            self.batch_id = 0

    def reset_episode(self):
        if self.train:
            self.noise.reset()
        self.noise_arr = []
        self.noise_matrix = [0., 0., 0., 0.]
        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the weights
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        if option is None:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('weights-best.hdf5')
        print(self.trained.model.summary())

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size * 2:
            experiences = self.memory.sample()
            self.learn(experiences)
        if len(self.memory) == self.buffer_size:
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)  # add some noise for exploration
        return action, noise

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # print("States", states.shape)
        # print("actions", actions.shape)
        # print("rewards", rewards.shape)
        # print("dones", dones.shape)
        # print("Next states", next_states.shape)

        # Keep training actor_local and critic_local. Use values from the
        # target models to update and train the local ones; don't train the
        # target models directly, we soft-update them instead.

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)  # actions predicted by the target actor
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])  # target critic

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.tensorboard.on_epoch_end(self.batch_id,
                                      named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
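
# learn() above calls a named_logs() helper that is not defined in this file.
# Based on the Stack Overflow thread linked in __init__ (driving a TensorBoard
# callback manually with train_on_batch), a plausible sketch is the following;
# it pairs the compiled model's metric names with the scalar losses returned.
def named_logs(model, logs):
    """Map a model's metrics_names onto the values train_on_batch returned."""
    return {name: value for name, value in zip(model.metrics_names, logs)}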
class DDPG_cartpole():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, state_size, action_size):
        self.epsilon = 0.8
        self.state_size = state_size
        self.action_size = action_size

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        # self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        # self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 20000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002   # for soft update of target parameters

        self.stats = np.array([])

    # def reset_episode(self):
    #     self.noise.reset()
    #     state = self.task.reset()
    #     self.last_state = state
    #     return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        # if score > 2:
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Roll over last state and action. This must happen before the early
        # return below, otherwise last_state goes stale once learning starts.
        self.last_state = next_state

        # Learn, if enough samples are available in memory
        # if len(self.memory) > self.batch_size:
        if len(self.memory) > 200:
            experiences = self.memory.sample(batch_size=self.batch_size)
            lossarr = self.learn(experiences)
            return lossarr
        return 0

    def act(self, state, epsilon=0.1, epsilon_min=0.05):
        """Returns actions for given state(s) as per current policy."""
        self.epsilon = epsilon
        if self.epsilon < epsilon_min:
            # print('epsilon', epsilon)
            self.epsilon = epsilon_min
        state = np.reshape(state, [-1, self.state_size])
        act_prob = self.actor_local.model.predict(state)[0]
        # if np.random.rand() > self.epsilon:
        #     action = np.argmax(act_prob)
        # else:
        #     action = np.random.binomial(1, 0.5, 1)[0]
        if np.random.rand() <= self.epsilon:
            # act_prob = act_prob + np.random.randn(2) * self.epsilon * 10
            action = np.random.binomial(1, 0.5, 1)[0]
            act_prob = np.zeros(2)
            act_prob[action] = 1
        # return list(action + self.noise.sample())  # add some noise for exploration
        # action_ = np.zeros(self.action_size)
        # action_[action] = 1
        return act_prob

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        # self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        history = self.critic_local.model.fit(x=[states, actions], y=Q_targets,
                                              batch_size=self.batch_size,
                                              epochs=1, verbose=0)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))

        # Calculate statistics on the action gradients
        statistics = np.array([action_gradients.max(),
                               action_gradients.min(),
                               action_gradients.mean()])
        self.stats = np.vstack([self.stats, statistics]) if self.stats.size else statistics

        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        # self.update_target()
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        return history.history['loss']

    def update_target(self):
        # TODO: separate this out per episode, not per step
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
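
# A minimal sketch of driving DDPG_cartpole on Gym's CartPole, assuming the
# classic gym API (reset() -> state, step(a) -> (state, reward, done, info)).
# act() returns a 2-element probability/one-hot vector, so argmax picks the
# discrete action; run_cartpole and the epsilon schedule are illustrative
# assumptions, not part of the original project.
import gym

def run_cartpole(num_episodes=300):
    env = gym.make('CartPole-v1')
    agent = DDPG_cartpole(state_size=4, action_size=2)
    for i_episode in range(num_episodes):
        state = env.reset()
        agent.last_state = state  # step() reads last_state before rolling it over
        epsilon = max(0.8 * 0.99 ** i_episode, 0.05)
        done, total_reward = False, 0.0
        while not done:
            act_prob = agent.act(state, epsilon=epsilon)
            action = int(np.argmax(act_prob))
            next_state, reward, done, _ = env.step(action)
            agent.step(act_prob, reward, next_state, done)  # store the 2-dim action vector
            state = next_state
            total_reward += reward
        print("Episode", i_episode, "reward", total_reward)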
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        # print("initializing DDPG agent")

        # Actor (Policy) Model
        # print("initializing actor_local")
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        # print("initializing actor_target")
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        # print("initializing critic_local")
        self.critic_local = Critic(self.state_size, self.action_size)
        # print("initializing critic_target")
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 1.0      # initial value was 0
        self.exploration_theta = 0.5   # initial value was 0.15
        self.exploration_sigma = 0.15  # initial value was 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000  # initially buffer_size = 100000
        self.batch_size = 64       # initial was 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor; initial value was 0.99
        self.tau = 0.05    # for soft update of target parameters; initial value was 0.01

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        # print("\nactor act sees state as {}\n".format(state))
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # print("we are about to learn from our experiences")
        # print(experiences)

        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local).
        # The get_action_gradients call below was throwing an error when running
        # with tg-gpu2 and Python 2.7; the debug prints were left in while
        # tracking that down.
        print("agent.py in learn method")
        print(states[1])
        print(actions[0])
        print("Python2.7 agent.py saw those for states and actions")
        # a_g = self.critic_local.get_action_gradients([states, actions, 0])
        # print(type(a_g[0]))
        # print(a_g[0])
        # print("Python2.7 agent.py saw that action gradient")
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        # print("\ntime to update the target model")
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
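
# Every class above shares the same soft-update rule,
#     w_target <- tau * w_local + (1 - tau) * w_target,
# so the target network exponentially tracks the local one. A quick
# standalone check of that arithmetic (the values are illustrative):
if __name__ == "__main__":
    tau = 0.01
    w_local, w_target = 1.0, 0.0
    for _ in range(500):
        w_target = tau * w_local + (1 - tau) * w_target
    # After 500 updates the target has closed about 99% of the gap,
    # since 1 - (1 - tau)**500 ~= 0.993.
    print("w_target after 500 soft updates: {:.3f}".format(w_target))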