import numpy as np

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to be provided by the
# project's own modules (their import paths are not shown in this excerpt).


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.085
        # self.exploration_sigma = 0.15
        self.exploration_theta = 0.070
        self.exploration_sigma = 0.20
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.70  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
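# Usage sketch for the class above: a minimal training loop. This assumes a
# task object exposing the reset()/step(action) interface used by the agent;
# the episode count is an arbitrary illustration value.
agent = DDPG(task)

for i_episode in range(500):
    state = agent.reset_episode()     # resets the task and the noise process
    total_reward, done = 0.0, False
    while not done:
        action = agent.act(state)     # policy action plus OU exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)   # store experience and learn
        state = next_state
        total_reward += reward
    print("Episode {:4d}  reward {:8.3f}".format(i_episode, total_reward))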
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    # name: a name used when saving the neural-network models
    # loadfile: if True, load weights from existing models; otherwise create an entirely new model
    def __init__(self, task, name, loadfile=False):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.name = name
        if loadfile:
            self.actor_local.model.load_weights("./weights/" + name + "_actor.h5")
            self.critic_local.model.load_weights("./weights/" + name + "_critic.h5")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  # also tried 0.3; original 0.15
        self.exploration_sigma = 0.3   # original 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99   # discount factor
        self.tau = 0.001    # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        # rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1))  # TESTING: scale rewards to a small range
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def save_weights(self):
        self.actor_local.model.save_weights("./weights/" + self.name + "_actor.h5")
        self.critic_local.model.save_weights("./weights/" + self.name + "_critic.h5")

    # Note: after training over a batch of experiences, we could simply copy the
    # newly learned weights from the local model to the target model. However,
    # individual batches can introduce a lot of variance into the process, so it
    # is better to perform a soft update, controlled by the parameter tau.
    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
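# Checkpointing sketch for the variant above: the "./weights" directory is
# assumed to exist, and "hover" is just an illustrative model name.
agent = DDPG(task, name="hover", loadfile=False)
# ... train for a while ...
agent.save_weights()   # writes ./weights/hover_actor.h5 and ./weights/hover_critic.h5

# Later, resume from the saved weights:
agent = DDPG(task, name="hover", loadfile=True)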
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, basename):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Learning rates
        self.actor_learning_rate = 0.0001
        self.critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99   # discount factor
        self.tau = 0.001    # for soft update of target parameters

        # Keep track of the best run
        self.nEpisode = 0
        self.bestEpisode = []
        self.bestEpisodeAt = -1

        # Logging
        self.state_labels = self.task.get_state_labels()
        self.action_labels = ['ac{}'.format(i) for i in range(self.action_size)]
        self.df_columns = ['t'] + self.state_labels.tolist() + self.action_labels + ['R']
        self.basename = os.path.join('log', basename)
        self.currentEpisode = []
        self.bestCumReward = -np.inf

    def reset_episode(self):
        self.noise.reset()
        self.last_state = self.task.reset()
        self.currentEpisode = []
        return self.last_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def step(self, action):
        last_state_variables = self.task.get_state_variables()
        last_t = self.task.sim.get_time()

        # Call the model for the state transition
        next_state, reward, done = self.task.step(action)

        # Log the current transition
        self.currentEpisode += [
            np.hstack([last_t, last_state_variables, action, reward])
        ]

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # Log the episode
            df = pd.DataFrame(self.currentEpisode, columns=self.df_columns)
            fn_i = '{}_{}'.format(self.basename, self.nEpisode)
            df.to_csv(fn_i + '.csv')
            cumR = df.R.sum()
            if len(df) > len(self.bestEpisode) or \
                    (len(df) == len(self.bestEpisode) and cumR > self.bestCumReward):
                self.bestCumReward = cumR
                self.bestEpisode = df
                self.bestEpisodeAt = self.nEpisode
            self.plot_episode(df, self.nEpisode, fn_i)
            sys.stdout.write(
                "\rEp#{:4d} dur_{} cumR_{:5.3f} best@{} dur_{} cumR_{:5.3f} ".format(
                    self.nEpisode, len(df), cumR,   # len(df): duration of the current episode
                    self.bestEpisodeAt, len(self.bestEpisode), self.bestCumReward))
            self.nEpisode += 1

        return next_state, done

    def train(self, num_episodes=1):
        for ep_i in range(num_episodes):
            state, done = self.reset_episode(), False
            while not done:
                action = self.act(state)
                state, done = self.step(action)

    def plot_episode(self, df, episNo, filename=''):
        fig = plt.figure(1)
        fig.clf()
        ax2 = fig.add_subplot(313)
        ax1 = fig.add_subplot(312, sharex=ax2)
        ax0 = fig.add_subplot(311, sharex=ax2)

        # Plot selected state variables, actions, and reward
        ax0.set_title('Ep#{} dur={:5.2f} sec'.format(episNo, df.t.iloc[-1]))
        df.plot(x='t', y=self.state_labels[:6], ax=ax0, style='.:')
        df.plot(x='t', y=self.state_labels[6:], ax=ax1, style='.:')
        df.plot(x='t', y=self.action_labels, ax=ax2, style='.:')
        df.plot(x='t', y='R', ax=ax2, secondary_y=True)
        plt.ylabel('Reward')
        plt.show()
        if len(filename) > 0:
            fig.savefig(filename)
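# Usage sketch for the logging variant above. It assumes a task that also
# exposes get_state_labels() and get_state_variables(), and that a ./log
# directory exists for the per-episode CSV files and plots.
agent = DDPG(task, basename="takeoff_run")
agent.train(num_episodes=200)    # each episode is logged to log/takeoff_run_<n>.csv and plotted
best_df = agent.bestEpisode      # DataFrame of the best (longest / highest-reward) episode so far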
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task):
        """Initialize models."""
        self.env = task
        self.state_size = task.observation_space.shape[0]
        self.action_size = task.action_space.shape[0]
        self.action_high = task.action_space.high
        self.action_low = task.action_space.low

        # Initialize Actor (policy) models
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Initialize Critic (value) models
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99   # discount factor
        self.tau = 0.001    # for soft update of target parameters

    def reset_episode(self, task):
        """Return the initial state after resetting the task."""
        self.noise.reset()
        state = task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Add experience to memory
        self.memory.add_experience(self.last_state, action, reward, next_state, done)

        # Learn if memory is larger than batch size
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over state
        self.last_state = next_state

    def act(self, state):
        """Returns action (with exploration noise) using the policy network."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.state_size)
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]
                                ).astype(np.float32).reshape(-1, self.state_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict(next_states)
        Q_targets_next = self.critic_target.model.predict([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            [-1, self.action_size])
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.actor_local.model, self.actor_target.model)
        self.soft_update(self.critic_local.model, self.critic_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_model(self, path):
        self.actor_local.model.save_weights(path)

    def load_model(self, path):
        self.actor_local.model.load_weights(path)

    def act_only(self, state):
        """Returns action from the policy network without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)
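# The variant above reads observation_space/action_space, so it can be run
# directly on a continuous-control Gym environment. A sketch assuming the
# classic Gym API (reset() returns the state, step() returns a 4-tuple);
# the environment name, episode count, and output path are illustrative.
import gym

env = gym.make("Pendulum-v1")
agent = DDPG(env)

for i_episode in range(100):
    state, done = agent.reset_episode(env), False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(np.array(action))
        agent.step(action, reward, next_state, done)
        state = next_state

agent.save_model("pendulum_actor.h5")   # saves the actor weights only, per save_model()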
import numpy as np

# PhysicsSim and OUNoise are assumed to come from the project's own modules.


class Task():
    def __init__(self,
                 runtime=5.,
                 init_pose=np.array([0.0, 0.0, 10.0, 0.0, 0.0, 0.0]),
                 init_velocities=np.array([0.0, 0.0, 0.0]),
                 init_angle_velocities=np.array([0.0, 0.0, 0.0]),
                 pos_noise=0.25,
                 angle_noise=None,
                 velocity_noise=0.15,
                 velocity_angle_noise=None,
                 target_pos=np.array([0.0, 0.0, 10.0])):
        self.target_pos = target_pos
        self.pos_noise = pos_noise
        self.angle_noise = angle_noise
        self.velocity_noise = velocity_noise
        self.velocity_angle_noise = velocity_angle_noise

        self.action_size = 1
        self.action_repeat = 1
        self.action_high = 1.2 * 400
        self.action_low = 0.99 * 400
        self.noise = OUNoise(self.action_size, mu=0.0, theta=0.2, sigma=0.1)

        # Map the agent's action in [-1, 1] onto the rotor-speed range:
        # with action_low = 0.99 * 400 = 396 and action_high = 1.2 * 400 = 480,
        # action_b = 438 and action_m = 42, so -1 -> 396 and +1 -> 480.
        self.action_b = (self.action_high + self.action_low) / 2.0
        self.action_m = (self.action_high - self.action_low) / 2.0

        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities,
                              init_angle_velocities, runtime)
        self.state_size = len(self.get_state())

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Alternative reward formulations tried during experimentation:
        # reward = np.tanh(1 - 0.7 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        # reward = np.square(self.sim.pose[:3] - self.target_pos).sum()
        # reward = np.sqrt(reward)
        # reward /= 3
        # np.clip(reward, 10, -10)
        # reward /= 10
        # reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        # reward = np.tanh(1 - 0.003 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        # reward = np.tanh(1 - 0.3 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        # reward = (1.5 - np.sum(np.square((self.sim.pose[:3] - self.target_pos) / 300.0))) * 2
        # reward = np.tanh(1. - 0.3 * (abs(self.sim.pose[:3] - self.target_pos)).sum())
        # reward = (0.5 - np.mean(np.square((self.sim.pose[:3] - self.target_pos) / 200.0))) * 2
        # reward = (0.5 - np.mean(np.square((self.sim.pose[:3] - self.target_pos) / 300.0))) * 2
        # reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        # if self.sim.pose[2] > 0:
        #     reward += 10   # abs(self.sim.pose[2] - self.target_pos[2])
        # else:
        #     reward -= 10   # abs(self.sim.pose[2] - self.target_pos[2])
        # reward = np.tanh(reward)
        # reward = np.tanh(1 - np.mean(np.square(self.sim.pose[:3] - self.target_pos)))
        # reward = 0
        # if self.sim.pose[2] > 0:
        #     reward += 1
        # if self.sim.pose[2] >= self.target_pos[2]:
        #     reward += 5
        # return reward
        # reward = self.sim.v[2] / 10.0
        # reward += (self.sim.pose[2] - self.target_pos[2]) / 10.0
        # reward -= np.linalg.norm(self.sim.pose[:2]) / 10.0
        # return reward
        # p1 = self.sim.pose[:3]
        # p2 = self.target_pos
        # env_bounds = 300.0
        # bound = np.array([env_bounds, env_bounds, env_bounds])
        # reward = (0.5 - np.mean(np.square((p1 - p2) / bound))) * 2

        reward = np.tanh(1. - 0.76 * (abs(self.sim.pose[:3] - self.target_pos)).sum())
        return reward

    def normalize_angles(self, angles):
        normalized_angles = np.copy(angles)
        for i in range(len(normalized_angles)):
            while normalized_angles[i] > np.pi:
                normalized_angles[i] -= 2 * np.pi
        return normalized_angles

    def get_state(self):
        position_error = (self.sim.pose[:3] - self.target_pos)
        return np.array(
            [position_error[2], self.sim.v[2], self.sim.linear_accel[2]])

    def step(self, actionInput):
        reward = 0
        # pose_all = []
        for _ in range(self.action_repeat):
            # Copy the action so the caller's value is not modified and the
            # noise is added element-wise.
            action = np.array(actionInput, dtype=np.float32)
            action += self.noise.sample()
            action = np.clip(action, -1, 1)
            speed_of_rotors = (action * self.action_m) + self.action_b
            # Update the sim pose and velocities; all four rotors get the same speed
            done = self.sim.next_timestep(speed_of_rotors * np.ones(4))
            reward += self.get_reward()
            next_state = self.get_state()
            if reward <= 0:
                done = True
            # pose_all.append(self.sim.pose)
            if self.sim.pose[2] >= self.target_pos[2]:
                reward += 1
            # if self.sim.pose[2] >= self.target_pos[2]:
            #     reward += 10
            # else:
            #     reward += -20
            # if done:
            #     if self.sim.time < self.sim.runtime:
            #         reward += -1
            #     else:
            #         reward += 5
            #     break
        # next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset_noise(self):
        self.noise.reset()

    def reset(self):
        self.sim.reset()
        self.noise.reset()
        rnd_pos = np.copy(self.sim.init_pose)
        rnd_pos[2] += np.random.normal(0.0, self.pos_noise, 1)
        self.sim.pose = np.copy(rnd_pos)
        rnd_velocity = np.copy(self.sim.init_velocities)
        rnd_velocity[2] += np.random.normal(0.0, self.velocity_noise, 1)
        self.sim.v = np.copy(rnd_velocity)
        return self.get_state()
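# OUNoise is used throughout this excerpt but not defined in it. A minimal
# Ornstein-Uhlenbeck process consistent with the constructor calls above,
# OUNoise(size, mu, theta, sigma), might look like the following sketch; it
# is an assumption, not necessarily the project's exact implementation.
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta   # strength of the pull back toward mu
        self.sigma = sigma   # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state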