def main():
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # Check environment dimensions
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        action_bound = 1

        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        # Build actor and critic networks
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        # Film training videos if applicable
        # env = wrappers.Monitor(env, MONITOR_DIR, force=True,
        #                        video_callable=lambda episode_id: episode_id % 49 == 0)

        train(sess, env, actor, critic, RESTORE)
def main():
    numHalls = 4
    hallWidth = 1.5
    hallLength = 20
    turns = ['right', 'right', 'right', 'right']
    car_dist_s = hallWidth / 2.0
    car_dist_f = hallLength / 2.0
    car_heading = 0
    time_step = 0.1

    with tf.Session() as sess:
        env = World(numHalls, hallWidth, hallLength, turns,
                    car_dist_s, car_dist_f, car_heading, MAX_EP_STEPS,
                    time_step, LIDAR_FIELD_OF_VIEW, LIDAR_NUM_RAYS,
                    lidar_noise=LIDAR_NOISE, lidar_missing_rays=LIDAR_MISSING_RAYS)

        # np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # Check environment dimensions
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high

        # Build actor and critic networks
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             MAX_ACTOR_LEARNING_RATE, TAU,
                             layer1_size=l1size,   # hidden-layer sizes defined elsewhere
                             layer2_size=l2size)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               MIN_CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        train(sess, env, actor, critic, RESTORE)
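# Both main() functions above hand off to a train(sess, env, actor, critic, restore)
# helper that is not shown in this section. The sketch below is a minimal DDPG training
# loop under assumed interfaces: actor.predict / predict_target / train,
# critic.predict_target / train / action_gradients, *.update_target_network, and a
# ReplayBuffer with add()/size()/sample_batch() are assumptions for illustration,
# not the original implementation.
def train(sess, env, actor, critic, restore):
    sess.run(tf.global_variables_initializer())
    actor.update_target_network()
    critic.update_target_network()
    buffer = ReplayBuffer(BUFFER_SIZE)  # assumed capacity constant

    for episode in range(MAX_EPISODES):  # assumed episode budget
        state = env.reset()
        for step in range(MAX_EP_STEPS):
            # Current policy plus exploration noise would go here
            action = actor.predict(np.reshape(state, (1, state_dim)))[0]
            next_state, reward, done, _ = env.step(action)
            buffer.add(state, action, reward, next_state, done)

            if buffer.size() > MINIBATCH_SIZE:
                s, a, r, s2, d = buffer.sample_batch(MINIBATCH_SIZE)
                # Bootstrap target: y = r + gamma * Q'(s', mu'(s')), zeroed at terminals
                target_q = critic.predict_target(s2, actor.predict_target(s2))
                y = r + GAMMA * target_q.squeeze() * (1 - d)
                critic.train(s, a, np.reshape(y, (MINIBATCH_SIZE, 1)))
                # Policy gradient step through the critic
                grads = critic.action_gradients(s, actor.predict(s))
                actor.train(s, grads[0])
                actor.update_target_network()
                critic.update_target_network()

            state = next_state
            if done:
                break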
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks # state_dim = 2, action_dim = 2 self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): # print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() return self.critic_network.q_value(state_batch, action_batch) def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): q_value = 0 # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.size() > REPLAY_START_SIZE: q_value = self.train() # if self.time_step % 10000 == 0: # self.actor_network.save_network(self.time_step) # self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() return q_value
class DDPG:

    @staticmethod
    def actorParams(sess, target):
        return {'sess': sess, 'state_size': HP.STATE_DIM, 'action_size': HP.ACTION_DIM,
                'tau': HP.ACTOR_TAU, 'lr': HP.ACTOR_LR, 'target': target}

    @staticmethod
    def criticParams(sess, target):
        return {'sess': sess, 'state_size': HP.STATE_DIM, 'action_size': HP.ACTION_DIM,
                'tau': HP.CRITIC_TAU, 'lr': HP.CRITIC_LR, 'target': target}

    def __init__(self, target_model=None, train=True, replaybuffer=None):
        assert isinstance(target_model, DDPG) or target_model is None
        config = tf.ConfigProto()
        sess = tf.Session(config=config)
        K.set_session(sess)

        target_actor = target_model.actor if target_model is not None else None
        target_critic = target_model.critic if target_model is not None else None
        self.actor = ActorNetwork(**DDPG.actorParams(sess, target_actor))
        self.critic = CriticNetwork(**DDPG.criticParams(sess, target_critic))

        self.target_model = target_model
        # Avoid a shared mutable default argument: build the buffer per instance
        self.replaybuffer = replaybuffer if replaybuffer is not None else ReplayBuffer(HP.BUFFER_SIZE)
        self.train = train
        self.epsilon = 1

    def act(self, obs):
        action = self.actor.model.predict(obs.reshape(1, HP.STATE_DIM))
        # Rescale the network output into the environment's action range
        action = action * (HP.MAX_ACTION - HP.MIN_ACTION) + HP.MIN_ACTION
        if self.train and self.epsilon > 0:
            self.epsilon -= 1e-6
            action = self.addOU(action)
        return action

    def addOU(self, action):
        return action + OU.ou(action, HP.OU_MEAN, HP.OU_THETA, HP.OU_SIGMA)

    def train_models(self):
        batch = self.replaybuffer.getBatch(HP.BATCH_SIZE)
        experiences = [np.asarray([i[j] for i in batch]) for j in range(5)]
        states, actions, rewards, nstates, dones = experiences

        target_q = self.compute_target_q(nstates, rewards, dones)
        loss = self.critic.model.train_on_batch([states, actions], target_q)

        a_for_grad = self.actor.model.predict(states)
        grads = self.critic.gradients(states, a_for_grad)
        self.actor.train(states, grads)

        self.actor.target_train()
        self.critic.target_train()
        return loss

    def remember(self, obs, action, reward, next_obs, done):
        self.replaybuffer.add(obs, action, reward, next_obs, done)

    def compute_target_q(self, nstates, rewards, dones):
        target_q_values = self.target_model.critic.model.predict(
            [nstates, self.target_model.actor.model.predict(nstates)])
        y_t = np.zeros(len(nstates))
        for idx, reward in enumerate(rewards):
            y_t[idx] = reward
            if not dones[idx]:
                # Bug fix: discounted bootstrap applies to this entry only,
                # not to the whole y_t array
                y_t[idx] += HP.GAMMA * target_q_values[idx]
        return y_t

    def copy_from_target(self):
        self.actor.copy_from_target()
        self.critic.copy_from_target()

    def save(self, location, epoch):
        self.actor.model.save(location + '/actor_model_{}.h5'.format(epoch))
        self.critic.model.save(location + '/critic_model_{}.h5'.format(epoch))
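# Usage sketch for the Keras-based agent above: a frozen DDPG instance supplies the
# bootstrap targets in compute_target_q() while a second instance learns. env,
# EPISODES, and the checkpoint path are assumptions for illustration, not part of
# the original code.
target = DDPG(train=False)
agent = DDPG(target_model=target, train=True)
for episode in range(EPISODES):
    obs = env.reset()
    done = False
    while not done:
        action = agent.act(obs)  # scaled to [MIN_ACTION, MAX_ACTION]; OU noise while training
        next_obs, reward, done, _ = env.step(action[0])
        agent.remember(obs, action[0], reward, next_obs, done)
        agent.train_models()  # one critic/actor update per environment step
        obs = next_obs
    if episode % 100 == 0:
        agent.save('./checkpoints', episode)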
class DDPG:

    def __init__(self, env, state_dim=None):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env

        # Randomly initialize actor and critic networks
        # along with their target networks
        if state_dim:
            self.state_dim = state_dim
            print(self.state_dim)
        else:
            self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        # Flag to signal save
        self.not_saved = True

        # For normalisation
        self.state_mean = 0
        self.state_std = 1
        self.target_mean = 0
        self.target_std = 1

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Normalise states and next states with per-batch statistics
        states = np.array(state_batch)
        targets = np.array(next_state_batch)
        self.state_mean = states.mean(axis=0)
        self.state_std = states.std(axis=0) + 1e-8
        self.target_mean = targets.mean(axis=0)
        self.target_std = targets.std(axis=0) + 1e-8
        states = (state_batch - self.state_mean) / self.state_std
        targets = (next_state_batch - self.target_mean) / self.target_std
        state_batch = states.tolist()
        next_state_batch = targets.tolist()

        # Needed when action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch from the target networks
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update the critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled policy gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise,
        # normalising the state first
        state = np.array(state)
        state = (state - self.state_mean) / self.state_std
        state = state.tolist()
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        # Normalise the state first
        state = np.array(state)
        state = (state - self.state_mean) / self.state_std
        state = state.tolist()
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done, episode):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer holds REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            self.time_step = self.critic_network.time_step

        # Save the networks once per 20-episode block
        if episode % 20 == 0:  # self.time_step % 400 == 0:
            if self.not_saved:
                self.actor_network.save_network(episode)   # (self.time_step)
                self.critic_network.save_network(episode)  # (self.time_step)
                self.not_saved = False
        else:
            self.not_saved = True

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
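# Note on the class above: state_mean/state_std are recomputed from each training
# minibatch, so action() and noise_action() normalise with whatever batch was seen
# last. A running normaliser such as the Welford-style sketch below is a common
# alternative; it is an assumption for illustration, not part of the original code.
class RunningNormaliser:

    def __init__(self, dim, eps=1e-8):
        self.n = 0
        self.mean = np.zeros(dim)
        self.m2 = np.zeros(dim)  # sum of squared deviations from the running mean
        self.eps = eps

    def update(self, x):
        # Welford's online update for mean and variance
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalise(self, x):
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + self.eps
        return (np.asarray(x) - self.mean) / std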