class DDPG: def __init__(self, state_dim, state_channel, action_dim): self.state_dim = state_dim self.state_channel = state_channel self.action_dim = action_dim self.sess = tf.InteractiveSession() self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.action_input = tf.placeholder('float', [None, action_dim]) self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) # create network self.actor_network.create_network(self.state_input) self.critic_network.create_q_network(self.state_input, self.actor_network.action_output) # create target network self.actor_network.create_target_network(self.target_state_input) self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output) # create training method self.actor_network.create_training_method(self.critic_network.q_value_output) self.critic_network.create_training_method() self.sess.run(tf.initialize_all_variables()) self.actor_network.update_target() self.critic_network.update_target() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg' if not os.path.exists(self.dir_path): os.mkdir(self.dir_path) # for log self.reward_input = tf.placeholder(tf.float32) tf.scalar_summary('reward', self.reward_input) self.time_input = tf.placeholder(tf.float32) tf.scalar_summary('living_time', self.time_input) self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph) self.episode_reward = 0.0 self.episode_start_time = 0.0 self.time_step = 1 self.saver = tf.train.Saver(tf.all_variables()) self.load_time_step() self.load_network() return def train(self): action_dim = self.action_dim minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # sample BATCH_SIZE from replay_buffer state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # if action_dim = 1, it's a number not a array action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim]) # calculate y_batch via target network next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch) y_batch = [] for i in range(BATCH_SIZE): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # print np.shape(reward_batch), np.shape(y_batch) # train actor network self.actor_network.train(state_batch) # train critic network self.critic_network.train(y_batch, state_batch, action_batch) # update target network self.actor_network.update_target() self.critic_network.update_target() return def noise_action(self, state): action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def _record_log(self, reward, living_time): summary_str = self.sess.run(self.summary_op, feed_dict={ 
self.reward_input: reward, self.time_input: living_time }) self.summary_writer.add_summary(summary_str, self.time_step) return def perceive(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) if self.episode_start_time == 0.0: self.episode_start_time = time.time() # for testing # self.time_step += 1 # if self.time_step == 100: # print '--------------------------------' # self.replay_buffer.save_to_pickle() # return self.episode_reward += reward living_time = time.time() - self.episode_start_time if self.time_step % 1000 == 0 or done: self._record_log(self.episode_reward, living_time) if self.replay_buffer.size() > REPLAY_START_SIZE: self.train() if self.time_step % 100000 == 0: self.save_network() if done: print '===============reset noise=========================' self.exploration_noise.reset() self.episode_reward = 0.0 self.episode_start_time = time.time() self.time_step += 1 return def load_time_step(self): if not os.path.exists(self.dir_path): return files = os.listdir(self.dir_path) step_list = [] for filename in files: if ('meta' in filename) or ('-' not in filename): continue step_list.append(int(filename.split('-')[-1])) step_list = sorted(step_list) if len(step_list) == 0: return self.time_step = step_list[-1] + 1 return def load_network(self): checkpoint = tf.train.get_checkpoint_state(self.dir_path) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print 'Successfully loaded:', checkpoint.model_checkpoint_path else: print 'Could not find old network weights' return def save_network(self): print 'save actor-critic network...', self.time_step self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step) return
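# --- Hedged sketch (not part of the original listing) ---
# The OUNoise helper used above for exploration is referenced but not defined here.
# A minimal Ornstein-Uhlenbeck process with the same interface could look like the
# following; the mu/theta/sigma defaults are assumptions, not values from the source.
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for DDPG."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Called at the end of an episode to restart the process
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state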
class DDPG: """docstring for DDPG""" def __init__(self, env_name, state_dim, action_dim): self.name = 'DDPG' # name for uploading results self.env_name = env_name # Randomly initialize actor network and critic network # with both their target networks self.state_dim = state_dim self.action_dim = action_dim # Ensure action bound is symmetric self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.OU = OU() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(save_location) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old network weights") def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.getBatch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def saveNetwork(self): self.saver.save(self.sess, save_location + self.env_name + 'network' + '-ddpg', global_step=self.time_step) def action(self, state): action = self.actor_network.action(state) action[0] = np.clip(action[0], -1, 1) action[1] = np.clip(action[1], 0, 1) action[2] = np.clip(action[2], 0, 1) #print "Action:", action return action def noise_action(self, state, epsilon): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) #print action.shape #print "Action_No_Noise:", action noise_t = np.zeros(self.action_dim) noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80) noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10) noise_t[2] = epsilon * self.OU.function(action[2], -0.1, 1.00, 0.05) if random.random() <= 0.01: # 0.1 print("********Stochastic brake***********") noise_t[2] = epsilon * self.OU.function(action[2], 0.2, 1.00, 0.10) action = action + noise_t action[0] = np.clip(action[0], -1, 1) action[1] = np.clip(action[1], 0, 1) action[2] = 
np.clip(action[2], 0, 1) #print "Action_Noise:", action return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer if (not (math.isnan(reward))): self.replay_buffer.add(state, action, reward, next_state, done) self.time_step = self.time_step + 1 # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train()
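# --- Hedged sketch (not part of the original listing) ---
# The OU object above exposes function(x, mu, theta, sigma), called once per action
# dimension. A plausible implementation, assuming the standard Ornstein-Uhlenbeck
# increment used in TORCS-style DDPG examples, is:
import numpy as np

class OU:
    def function(self, x, mu, theta, sigma):
        # Drift toward mu plus Gaussian noise; returns a single noise increment
        return theta * (mu - x) + sigma * np.random.randn(1)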
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self,observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch,[BATCH_SIZE,1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch) for i in range(0,BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high) def set_feedback(self,observation,action,reward,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append((self.state,action,reward,next_state,done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, env, DIRECTORY): self.batch_size = BATCH_SIZE self.replay_start_size = REPLAY_START_SIZE # self.sub_batch_size = BATCH_SIZE / n_gpu self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False)) self.trace_length = TRACE_LENGTH self.temp_abstract = TEMP_ABSTRACT self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim, self.action_dim, self.temp_abstract, DIRECTORY) self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim, self.action_dim, self.temp_abstract, DIRECTORY) # initialize replay buffer max_len_trajectory = self.environment.spec.timestep_limit + 1 # trace_length self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY, max_len_trajectory, self.actor_network.last_epi) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) ### self.diff = 0. self.discounting_mat_dict = {} ### def state_initialiser(self, shape, mode='g'): if mode == 'z': #Zero initial = np.zeros(shape=shape) elif mode == 'g': #Gaussian # initial = stats.truncnorm.rvs(a=-0.02/0.01,b=0.02/0.01,loc=0.,scale=0.01,size=shape) initial = np.random.normal(loc=0., scale=1. / float(shape[1]), size=shape) else: # May do some adaptive initialiser can be built in later raise NotImplementedError return initial def train(self, time_step): #,time_step): ###1) Get-batch data for opt minibatch, trace_length = self.replay_buffer.get_batch( self.batch_size, self.trace_length, time_step) #, self.trace_length) try: state_trace_batch = np.stack(minibatch[:, :, 2].ravel()).reshape( self.batch_size, trace_length, self.state_dim) action_trace_batch = np.stack(minibatch[:, :, 3].ravel()).reshape( self.batch_size, trace_length, self.action_dim) next_state_batch = np.stack(minibatch[:, -1, 6].ravel()).reshape( self.batch_size, 1, self.state_dim) next_state_trace_batch = np.concatenate( [state_trace_batch, next_state_batch], axis=1) reward_trace_batch = np.stack(minibatch[:, :, 4].ravel()).reshape( self.batch_size, trace_length, 1) done_trace_batch = np.stack(minibatch[:, :, 7].ravel()).reshape( self.batch_size, trace_length, 1) except Exception as e: print(str(e)) raise ###2) Painfully initialise initial memories of LSTMs: not super-efficient, but no error guaranteed from tf's None-type zero-state problem init_actor_hidden1_cORm_batch = self.state_initialiser( shape=(self.batch_size, self.actor_network.rnn_size), mode='z') actor_init_h_batch = ( init_actor_hidden1_cORm_batch, init_actor_hidden1_cORm_batch ) #((init_hidden1_cORm_batch,init_hidden1_cORm_batch),(init_actor_hidden2_cORm_batch,init_actor_hidden2_cORm_batch)) init_critic_hidden1_cORm_batch = self.state_initialiser( shape=(self.batch_size, self.critic_network.rnn_size), mode='z') critic_init_h_batch = ( init_critic_hidden1_cORm_batch, init_critic_hidden1_cORm_batch ) #,(init_critic_hidden3_cORm_batch,init_critic_hidden3_cORm_batch)) ### self.dt_list = np.zeros(shape=(15, )) self.dt_list[-1] = time.time() if trace_length <= OPT_LENGTH: target_actor_init_h_batch = actor_init_h_batch target_critic_init_h_batch = critic_init_h_batch pass else: ### memory stuff actor_init_h_batch = self.actor_network.action( 
state_trace_batch[:, :-OPT_LENGTH, :], actor_init_h_batch, mode=1) target_actor_init_h_batch = actor_init_h_batch critic_init_h_batch = self.critic_network.evaluation( state_trace_batch[:, :-OPT_LENGTH, :], action_trace_batch[:, :-OPT_LENGTH, :], critic_init_h_batch, mode=1) target_critic_init_h_batch = critic_init_h_batch state_trace_batch = state_trace_batch[:, -OPT_LENGTH:, :] next_state_trace_batch = next_state_trace_batch[:, -(OPT_LENGTH + 1):, :] action_trace_batch = action_trace_batch[:, -OPT_LENGTH:, :] reward_trace_batch = reward_trace_batch[:, -OPT_LENGTH:, :] done_trace_batch = done_trace_batch[:, -OPT_LENGTH:, :] self.dt_list[0] = time.time() - np.sum(self.dt_list) ###3) Obtain target output next_action_batch = self.actor_network.target_action( next_state_trace_batch, init_temporal_hidden_cm_batch=target_actor_init_h_batch) self.dt_list[1] = time.time() - np.sum(self.dt_list) next_action_trace_batch = np.concatenate( [action_trace_batch, np.expand_dims(next_action_batch, axis=1)], axis=1) self.dt_list[2] = time.time() - np.sum(self.dt_list) target_lastQ_batch = self.critic_network.target_q_trace( next_state_trace_batch, next_action_trace_batch, init_temporal_hidden_cm_batch=target_critic_init_h_batch) self.dt_list[3] = time.time() - np.sum(self.dt_list) # Control the length of time-step for gradient if trace_length <= OPT_LENGTH: update_length = np.minimum( trace_length, OPT_LENGTH // 1) #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2) else: update_length = OPT_LENGTH // 1 #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2) target_lastQ_batch_masked = target_lastQ_batch * ( 1. - done_trace_batch[:, -1]) rQ = np.concatenate([ np.squeeze(reward_trace_batch[:, -update_length:], axis=-1), target_lastQ_batch_masked ], axis=1) self.dt_list[4] = time.time() - np.sum(self.dt_list) try: discounting_mat = self.discounting_mat_dict[update_length] except KeyError: discounting_mat = np.zeros(shape=(update_length, update_length + 1), dtype=np.float) for i in range(update_length): discounting_mat[i, :i] = 0. discounting_mat[i, i:] = GAMMA**np.arange(0., -i + update_length + 1) discounting_mat = np.transpose(discounting_mat) self.discounting_mat_dict[update_length] = discounting_mat try: y_trace_batch = np.expand_dims(np.matmul(rQ, discounting_mat), axis=-1) except Exception as e: print('?') raise self.dt_list[5] = time.time() - np.sum(self.dt_list) ###4)Train Critic: get next_action, target_q, then optimise critic_grad = self.critic_network.train( y_trace_batch, update_length, state_trace_batch, action_trace_batch, init_temporal_hidden_cm_batch=critic_init_h_batch) self.dt_list[6] = time.time() - np.sum(self.dt_list) ###5) Train Actor: while updated critic, we declared the dQda. 
Hence sess,run(dQda*dadParam_actor), then optimise actor for i in range(update_length): actor_init_h_batch_trace = (np.expand_dims(actor_init_h_batch[0], axis=1), np.expand_dims(actor_init_h_batch[1], axis=1)) critic_init_h_batch_trace = (np.expand_dims(critic_init_h_batch[0], axis=1), np.expand_dims(critic_init_h_batch[1], axis=1)) if i == 0: actor_init_h_batch_stack = actor_init_h_batch_trace critic_init_h_batch_stack = critic_init_h_batch_trace else: actor_init_h_batch_stack = (np.concatenate( (actor_init_h_batch_stack[0], actor_init_h_batch_trace[0]), axis=1), np.concatenate( (actor_init_h_batch_stack[1], actor_init_h_batch_trace[1]), axis=1)) critic_init_h_batch_stack = ( np.concatenate((critic_init_h_batch_stack[0], critic_init_h_batch_trace[0]), axis=1), np.concatenate((critic_init_h_batch_stack[1], critic_init_h_batch_trace[1]), axis=1)) action_trace_batch_for_gradients, actor_init_h_batch = self.actor_network.action_trace( np.expand_dims(state_trace_batch[:, i], 1), init_temporal_hidden_cm_batch=actor_init_h_batch) critic_init_h_batch = self.critic_network.evaluation_trace( np.expand_dims(state_trace_batch[:, i], 1), np.expand_dims(action_trace_batch[:, i], 1), init_temporal_hidden_cm_batch=critic_init_h_batch) if i == 0: action_trace_batch_for_gradients_stack = action_trace_batch_for_gradients else: action_trace_batch_for_gradients_stack = np.concatenate( (action_trace_batch_for_gradients_stack, action_trace_batch_for_gradients), axis=1) self.dt_list[7] = time.time() - np.sum(self.dt_list) state_trace_batch_stack = np.reshape( state_trace_batch, (self.batch_size * update_length, 1, self.state_dim)) action_trace_batch_stack = np.reshape( action_trace_batch, (self.batch_size * update_length, 1, self.action_dim)) action_trace_batch_for_gradients_stack = np.reshape( action_trace_batch_for_gradients_stack, (self.batch_size * update_length, 1, self.action_dim)) actor_init_h_batch_stack = (np.reshape( actor_init_h_batch_stack[0], (self.batch_size * update_length, self.actor_network.rnn_size)), np.reshape( actor_init_h_batch_stack[1], (self.batch_size * update_length, self.actor_network.rnn_size))) critic_init_h_batch_stack = (np.reshape( critic_init_h_batch_stack[0], (self.batch_size * update_length, self.critic_network.rnn_size)), np.reshape( critic_init_h_batch_stack[1], (self.batch_size * update_length, self.critic_network.rnn_size))) q_gradient_trace_batch = self.critic_network.gradients( 1, state_trace_batch_stack, action_trace_batch_for_gradients_stack, init_temporal_hidden_cm_batch=critic_init_h_batch_stack) self.dt_list[8] = time.time() - np.sum(self.dt_list) # Update the actor policy using the sampled gradient: actor_grad = self.actor_network.train( q_gradient_trace_batch, 1, state_trace_batch_stack, action_trace_batch_stack, init_temporal_hidden_cm_batch=actor_init_h_batch_stack) self.dt_list[9] = time.time() - np.sum(self.dt_list) # Update the target networks via EMA & Indicators # self.critic_network.update_target() self.dt_list[10] = time.time() - np.sum(self.dt_list) # self.actor_network.update_target() self.dt_list[11] = time.time() - np.sum(self.dt_list) # actor_diff = self.actor_network.get_diff() self.dt_list[12] = time.time() - np.sum(self.dt_list) # critic_diff = self.critic_network.get_diff() self.dt_list[13] = time.time() - np.sum(self.dt_list) self.dt_list = np.delete(self.dt_list, -1) return actor_grad, critic_grad, # actor_diff, actor_grad, critic_diff, critic_grad def action(self, state_trace, init_hidden_cm, epi, noisy=True): # Select action a_t according to 
the current policy and exploration noise action, last_hidden_cm = self.actor_network.action([state_trace], init_hidden_cm, mode=2) if noisy: noise = self.exploration_noise.noise() #epi) return action + noise, last_hidden_cm #, dt#, np.linalg.norm(noise) else: return action, last_hidden_cm def evaluation(self, state_trace, action_trace, action_last, init_hidden_cm): return self.critic_network.evaluation([state_trace], [action_trace], action_last, init_hidden_cm, mode=2) #q_value, last_hidden_cm # def perceive(self,actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,time_step,epi): def perceive(self, state, action, reward, next_state, done, time_step, epi): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer # self.replay_buffer.add(actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,epi) done = float(done) self.replay_buffer.add(state, action, reward, next_state, done, epi, time_step) # Store transitions to replay start size then start training if (self.replay_buffer.num_experiences > REPLAY_START_SIZE): # Non-zero diff should be found self.actor_grad, self.critic_grad = self.train(time_step) # self.actor_diff, self.actor_grad, self.critic_diff, self.critic_grad = self.train(time_step) else: # Zero diff as is not trained # self.actor_diff = 0. self.actor_grad = 0. # self.critic_diff = 0. self.critic_grad = 0. # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
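# --- Hedged sketch (not part of the original listing) ---
# The y_trace_batch computation in train() above builds multi-step targets
#   y_t = r_t + GAMMA * r_{t+1} + ... + GAMMA^(T-t) * Q_target(s_T, a_T)
# with a single matrix multiply against a cached discounting matrix. A small
# standalone check of that construction (values are illustrative):
import numpy as np

GAMMA = 0.99
update_length = 3
# rewards r_0..r_2 followed by the done-masked bootstrap Q for one trajectory
rQ = np.array([[1.0, 2.0, 3.0, 10.0]])

discounting_mat = np.zeros((update_length, update_length + 1))
for i in range(update_length):
    discounting_mat[i, i:] = GAMMA ** np.arange(0., update_length + 1 - i)

y_trace = rQ @ discounting_mat.T  # shape (1, update_length)
# y_trace[0, 0] == 1 + 0.99*2 + 0.99**2*3 + 0.99**3*10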
def run_ddpg(amodel, cmodel, train_indicator=0, seeded=1337, track_name='practgt2.xml'): OU = FunctionOU() BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 # Target Network HyperParameters LRA = 0.0001 # Learning rate for Actor LRC = 0.001 # Lerning rate for Critic ALPHA = 0.9 action_dim = 3 # Steering/Acceleration/Brake state_dim = 29 # of sensors input np.random.seed(seeded) vision = False EXPLORE = 100000. if train_indicator: episode_count = 600 else: episode_count = 3 max_steps = 20000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) # Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=track_name) if not train_indicator: # Now load the weight #logging.info("Now we load the weight") print("Now we load the weight") try: actor.model.load_weights(amodel) critic.model.load_weights(cmodel) actor.target_model.load_weights(amodel) critic.target_model.load_weights(cmodel) #logging.info(" Weight load successfully") print("Weight load successfully") except: #ogging.info("Cannot find the weight") print("Cannot find the weight") exit() #logging.info("TORCS Experiment Start.") print("TORCS Experiment Start.") best_lap = 500 for i_episode in range(episode_count): print("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count())) #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count())) if np.mod(i_episode, 3) == 0: ob = env.reset( relaunch=True ) # relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. 
for j_iter in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][2], -0.1, 1.00, 0.05) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] ob, r_t, done, info = env.step(a_t[0]) s_t1 = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) buff.add(s_t, a_t[0], r_t, s_t1, done) # Add replay buffer # Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if train_indicator: loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 print("Episode", i_episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) if np.mod(step, 1000) == 0: logging.info("Episode {}, Distance {}, Last Lap {}".format( i_episode, ob.distRaced, ob.lastLapTime)) if ob.lastLapTime > 0: if best_lap < ob.lastLapTime: best_lap = ob.lastLapTime step += 1 if done: break if train_indicator and i_episode > 20: if np.mod(i_episode, 3) == 0: logging.info("Now we save model") actor.model.save_weights("ddpg_actor_weights_periodic.h5", overwrite=True) critic.model.save_weights("ddpg_critic_weights_periodic.h5", overwrite=True) print("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("Best Lap {}".format(best_lap)) print("") logging.info("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " + str(total_reward)) logging.info("Best Lap {}".format(best_lap)) env.end() # This is for shutting down TORCS logging.info("Finish.")
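# --- Hedged sketch (not part of the original listing) ---
# critic.gradients(states, actions) above is assumed to return dQ/da for the batch,
# which is then fed to actor.train() as in the usual Keras + TF1 DDPG pattern.
# The attribute names below (state, action, model) are assumptions about CriticNetwork,
# not taken from this listing.
import tensorflow as tf

class CriticGradientsSketch:
    def __init__(self, sess, state_ph, action_ph, q_model):
        self.sess = sess
        self.state = state_ph
        self.action = action_ph
        self.model = q_model
        # Symbolic gradient of the Q output with respect to the action input
        self.action_grads = tf.gradients(self.model.output, self.action)

    def gradients(self, states, actions):
        return self.sess.run(self.action_grads,
                             feed_dict={self.state: states, self.action: actions})[0]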
class DDPG: """docstring for DDPG""" def __init__(self, env, results_file): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) results_file.write(ActorNetwork.get_settings()) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class NeuralAgent(): def __init__(self, track_name='practgt2.xml'): BUFFER_SIZE = 100000 TAU = 0.001 # Target Network HyperParameters LRA = 0.0001 # Learning rate for Actor LRC = 0.001 # Lerning rate for Critic state_dim = 29 # of sensors input self.batch_size = 32 self.lambda_mix = 10.0 self.action_dim = 3 # Steering/Acceleration/Brake # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) self.actor = ActorNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRA) self.critic = CriticNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRC) self.buff = ReplayBuffer(BUFFER_SIZE) # Create replay buffer self.track_name = track_name self.save = dict(total_reward=[], total_step=[], ave_reward=[], distRaced=[], distFromStart=[], lastLapTime=[], curLapTime=[], lapTimes=[], avelapTime=[], ave_sp=[], max_sp=[], min_sp=[], test_total_reward=[], test_total_step=[], test_ave_reward=[], test_distRaced=[], test_distFromStart=[], test_lastLapTime=[], test_curLapTime=[], test_lapTimes=[], test_avelapTime=[], test_ave_sp=[], test_max_sp=[], test_min_sp=[]) def rollout(self, env): max_steps = 10000 vision = False # zhichen: it is not stable to have two torcs env and UDP connections # env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name) ob = env.reset(relaunch=True) s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. sp = [] lastLapTime = [] for j_iter in range(max_steps): a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0])) a_t = a_t[0] # print('test a_t:', a_t) a_t[0] = clip(a_t[0], -1, 1) a_t[1] = clip(a_t[1], 0, 1) a_t[2] = clip(a_t[2], 0, 1) ob, r_t, done, info = env.step(a_t) sp.append(info['speed']) if lastLapTime == []: if info['lastLapTime'] > 0: lastLapTime.append(info['lastLapTime']) elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[ 'lastLapTime']: lastLapTime.append(info['lastLapTime']) if np.mod(j_iter + 1, 20) == 0: logging.info('step: ' + str(j_iter + 1)) print('\n ob: ', ob) s_t = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward += r_t if done: break logging.info("Test Episode Reward: " + str(total_reward) + " Episode Length: " + str(j_iter + 1) + " Ave Reward: " + str(total_reward / (j_iter + 1)) + "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) + "\n Last Lap Times: " + str(info['lastLapTime']) + " Cur Lap Times: " + str(info['curLapTime']) + " lastLaptime: " + str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) + " max sp: " + str(np.max(sp))) #logging.info(" Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) + # " Episode Length: " + str(j_iter+1) + " Distance" + str(ob.distRaced) + " Lap Times: " + str(ob.lastLapTime)) #env.end() # This is for shutting down TORCS ave_sp = np.mean(sp) max_sp = np.max(sp) min_sp = np.min(sp) return total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp, lastLapTime def update_neural(self, controllers, episode_count=200, tree=False, seed=1337): OU = FunctionOU() vision = False GAMMA = 0.99 EXPLORE = 100000. 
max_steps = 10000 reward = 0 done = False step = 0 epsilon = 1 if not tree: steer_prog, accel_prog, brake_prog = controllers # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name) window = 5 lambda_store = np.zeros((max_steps, 1)) lambda_max = 40. factor = 0.8 logging.info("TORCS Experiment Start with Lambda = " + str(self.lambda_mix)) for i_episode in range(episode_count): logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(self.buff.count())) if np.mod(i_episode, 3) == 0: logging.info('relaunch TORCS') ob = env.reset( relaunch=True ) # relaunch TORCS every 3 episode because of the memory leak error else: logging.info('reset TORCS') ob = env.reset() #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)] s_t = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), [0, 0, 0]] window_list = [tempObs[:] for _ in range(window)] sp = [] lastLapTime = [] for j_iter in range(max_steps): if tree: tree_obs = [ sensor for obs in tempObs[:-1] for sensor in obs ] act_tree = controllers.predict([tree_obs]) steer_action = clip_to_range(act_tree[0][0], -1, 1) accel_action = clip_to_range(act_tree[0][1], 0, 1) brake_action = clip_to_range(act_tree[0][2], 0, 1) else: steer_action = clip_to_range( steer_prog.pid_execute(window_list), -1, 1) accel_action = clip_to_range( accel_prog.pid_execute(window_list), 0, 1) brake_action = clip_to_range( brake_prog.pid_execute(window_list), 0, 1) action_prior = [steer_action, accel_action, brake_action] tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), action_prior] window_list.pop(0) window_list.append(tempObs[:]) loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, self.action_dim]) noise_t = np.zeros([1, self.action_dim]) a_t_original = self.actor.model.predict( s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = max(epsilon, 0) * OU.function( a_t_original[0][2], 0, 1.00, 0.05) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] mixed_act = [ a_t[0][k_iter] / (1 + self.lambda_mix) + (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter] for k_iter in range(3) ] ob, r_t, done, info = env.step(mixed_act) sp.append(info['speed']) if lastLapTime == []: if info['lastLapTime'] > 0: lastLapTime.append(info['lastLapTime']) elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[ 'lastLapTime']: lastLapTime.append(info['lastLapTime']) s_t1 = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) self.buff.add(s_t, a_t[0], r_t, s_t1, done) # Add replay buffer # Do the batch update batch = self.buff.getBatch(self.batch_size) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.zeros((states.shape[0], 1)) target_q_values = self.critic.target_model.predict( 
[new_states, self.actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] loss += self.critic.model.train_on_batch([states, actions], y_t) a_for_grad = self.actor.model.predict(states) grads = self.critic.gradients(states, a_for_grad) self.actor.train(states, grads) self.actor.target_train() self.critic.target_train() total_reward += r_t s_t = s_t1 # Control prior mixing term if j_iter > 0 and i_episode > 50: lambda_track = lambda_max * (1 - np.exp(-factor * np.abs( r_t + GAMMA * np.mean(target_q_values[-1] - base_q[-1])))) lambda_track = np.squeeze(lambda_track) else: lambda_track = 10. lambda_store[j_iter] = lambda_track base_q = copy.deepcopy(target_q_values) if np.mod(step, 2000) == 0: logging.info("Episode " + str(i_episode) + " Distance " + str(ob.distRaced) + " Lap Times " + str(ob.lastLapTime)) step += 1 if done: break #else: # env.end() self.lambda_mix = np.mean(lambda_store) logging.info('Episode ends! \n' + "Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) + " Episode Length: " + str(j_iter + 1) + " Ave Reward: " + str(total_reward / (j_iter + 1)) + "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) + "\n Last Lap Times: " + str(info['lastLapTime']) + " Cur Lap Times: " + str(info['curLapTime']) + " lastLaptime: " + str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) + " max sp: " + str(np.max(sp))) #logging.info(" Lambda Mix: " + str(self.lambda_mix)) self.save['total_reward'].append(total_reward) self.save['total_step'].append(j_iter + 1) self.save['ave_reward'].append(total_reward / (j_iter + 1)) self.save['distRaced'].append(info['distRaced']) self.save['distFromStart'].append(info['distFromStart']) self.save['lastLapTime'].append(info['lastLapTime']) self.save['curLapTime'].append(info['curLapTime']) self.save['lapTimes'].append(lastLapTime) if lastLapTime == []: self.save['avelapTime'].append(0) else: self.save['avelapTime'].append(np.mean(lastLapTime)) self.save['ave_sp'].append(np.mean(sp)) self.save['max_sp'].append(np.max(sp)) self.save['min_sp'].append(np.min(sp)) # test if np.mod(i_episode + 1, 10) == 0: logging.info("Start Testing!") test_total_reward, test_step, test_info, test_ave_sp, test_max_sp, test_min_sp, test_lastLapTime = self.rollout( env) self.save['test_total_reward'].append(test_total_reward) self.save['test_total_step'].append(test_step) self.save['test_ave_reward'].append(test_total_reward / test_step) self.save['test_distRaced'].append(test_info['distRaced']) self.save['test_distFromStart'].append( test_info['distFromStart']) self.save['test_lastLapTime'].append(test_info['lastLapTime']) self.save['test_curLapTime'].append(test_info['curLapTime']) self.save['test_lapTimes'].append(test_lastLapTime) if test_lastLapTime == []: self.save['test_avelapTime'].append(0) else: self.save['test_avelapTime'].append( np.mean(test_lastLapTime)) self.save['test_ave_sp'].append(test_ave_sp) self.save['test_max_sp'].append(test_max_sp) self.save['test_min_sp'].append(test_min_sp) if np.mod(i_episode + 1, 5) == 0: print("Now we save model") #os.remove("actormodel.h5") self.actor.model.save_weights("actormodel_" + str(seed) + ".h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(self.actor.model.to_json(), outfile) #os.remove("criticmodel.h5") self.critic.model.save_weights("criticmodel_" + str(seed) + ".h5", overwrite=True) with open("criticmodel.json", "w") as outfile: 
json.dump(self.critic.model.to_json(), outfile) filename = "./model/actormodel_" + str(seed) + '_' + str( i_episode + 1) + ".h5" dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) self.actor.model.save_weights(filename, overwrite=True) filename = "./model/criticmodel_" + str(seed) + '_' + str( i_episode + 1) + ".h5" dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) self.critic.model.save_weights(filename, overwrite=True) if np.mod(i_episode + 1, 10) == 0: filename = "./Fig/iprl_save_" + str(seed) dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename, 'wb') as f: pickle.dump(self.save, f) if i_episode > 1000 and all( np.array(self.save['total_reward'][-20:]) < 20): print('model degenerated. Stop at Epsisode ' + str(i_episode)) break env.end() # This is for shutting down TORCS logging.info("Neural Policy Update Finish.") return None def collect_data(self, controllers, tree=False): vision = False max_steps = 10000 step = 0 if not tree: steer_prog, accel_prog, brake_prog = controllers # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name) ob = env.reset(relaunch=True) print("S0=", ob) window = 5 lambda_store = np.zeros((max_steps, 1)) lambda_max = 40. factor = 0.8 logging.info("TORCS Collection started with Lambda = " + str(self.lambda_mix)) s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward = 0. tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), [0, 0, 0]] window_list = [tempObs[:] for _ in range(window)] observation_list = [] actions_list = [] lastLapTime = [] sp = [] for j_iter in range(max_steps): if tree: tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs] act_tree = controllers.predict([tree_obs]) steer_action = clip_to_range(act_tree[0][0], -1, 1) accel_action = clip_to_range(act_tree[0][1], 0, 1) brake_action = clip_to_range(act_tree[0][2], 0, 1) else: steer_action = clip_to_range( steer_prog.pid_execute(window_list), -1, 1) accel_action = clip_to_range( accel_prog.pid_execute(window_list), 0, 1) brake_action = clip_to_range( brake_prog.pid_execute(window_list), 0, 1) action_prior = [steer_action, accel_action, brake_action] tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY], [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0), list(ob.track), action_prior] window_list.pop(0) window_list.append(tempObs[:]) a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0])) mixed_act = [ a_t[0][k_iter] / (1 + self.lambda_mix) + (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter] for k_iter in range(3) ] if tree: newobs = [item for sublist in tempObs[:-1] for item in sublist] observation_list.append(newobs[:]) else: observation_list.append(window_list[:]) actions_list.append(mixed_act[:]) ob, r_t, done, info = env.step(mixed_act) sp.append(info['speed']) if lastLapTime == []: if info['lastLapTime'] > 0: lastLapTime.append(info['lastLapTime']) elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[ 'lastLapTime']: lastLapTime.append(info['lastLapTime']) s_t1 = np.hstack( (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track)) total_reward += r_t s_t = s_t1 #if np.mod(step, 2000) == 0: # logging.info(" Distance " + str(ob.distRaced) + " Lap Times " + 
str(ob.lastLapTime)) step += 1 if done: break logging.info("Data Collection Finished!") logging.info('Episode ends! \n' + "Episode Reward: " + str(total_reward) + " Episode Length: " + str(j_iter + 1) + " Ave Reward: " + str(total_reward / (j_iter + 1)) + "\n Distance: " + str(info['distRaced']) + ' ' + str(info['distFromStart']) + "\n Last Lap Times: " + str(info['lastLapTime']) + " Cur Lap Times: " + str(info['curLapTime']) + " lastLaptime: " + str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) + " max sp: " + str(np.max(sp))) env.end() return observation_list, actions_list def label_data(self, controllers, observation_list, tree=False): if not tree: steer_prog, accel_prog, brake_prog = controllers actions_list = [] net_obs_list = [] logging.info("Data labelling started with Lambda = " + str(self.lambda_mix)) for window_list in observation_list: if tree: act_tree = controllers.predict([window_list]) steer_action = clip_to_range(act_tree[0][0], -1, 1) accel_action = clip_to_range(act_tree[0][1], 0, 1) brake_action = clip_to_range(act_tree[0][2], 0, 1) net_obs_list.append(window_list) else: steer_action = clip_to_range( steer_prog.pid_execute(window_list), -1, 1) accel_action = clip_to_range( accel_prog.pid_execute(window_list), 0, 1) brake_action = clip_to_range( brake_prog.pid_execute(window_list), 0, 1) net_obs = [sensor for obs in window_list[-1] for sensor in obs] net_obs_list.append(net_obs[:29]) action_prior = [steer_action, accel_action, brake_action] s_t = np.hstack([[net_obs[:29]]]) a_t = self.actor.model.predict(s_t.reshape(1, 29)) mixed_act = [ a_t[0][k_iter] / (1 + self.lambda_mix) + (self.lambda_mix / (1 + self.lambda_mix)) * action_prior[k_iter] for k_iter in range(3) ] actions_list.append(mixed_act[:]) return net_obs_list, observation_list, actions_list
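# --- Illustration (not part of the original listing) ---
# Throughout NeuralAgent the executed action blends the DDPG output with a
# programmatic control prior:
#   u = u_RL / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * u_prior
# A tiny numeric example (the numbers are made up):
lambda_mix = 10.0
u_rl = [0.3, 0.8, 0.0]     # steer / accel / brake from the actor network
u_prior = [0.1, 0.6, 0.0]  # steer / accel / brake from the PID or tree controller
mixed = [u_rl[k] / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * u_prior[k]
         for k in range(3)]
# With lambda_mix = 10 the prior dominates: mixed is approximately [0.118, 0.618, 0.0]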
class DDPG(object): def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env self.epsilon_expert_range = (1.0, 0.1) self.epsilon_expert = self.epsilon_expert_range[0] self.epsilon_random_range = (0.1, 0.01) self.epsilon_random = self.epsilon_random_range[0] # Randomly initialize actor network and critic network # with both their target networks # self.state_dim = env.observation_space.shape[0] self.state_dim = 16 # self.action_dim = env.action_space.shape[0] self.action_dim = 3 self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) # self.exploration_noise = OUNoise() self.OU = OU() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: path = checkpoint.model_checkpoint_path self.saver.restore(self.sess, path) self.time_step = int(path[path.rindex('-') + 1:]) self.epsilon_expert -= ( self.epsilon_expert_range[0] - self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1]) self.epsilon_random -= ( self.epsilon_random_range[0] - self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1]) logger.warn( "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s" % (path, self.time_step, self.epsilon_expert, self.epsilon_random)) else: logger.warn("Could not find old network weights") self.critic_cost = 0 def train(self): self.time_step = self.time_step + 1 self.epsilon_expert -= (self.epsilon_expert_range[0] - self.epsilon_expert_range[1]) / EXPLORE_COUNT self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1]) self.epsilon_random -= (self.epsilon_random_range[0] - self.epsilon_random_range[1]) / EXPLORE_COUNT self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1]) logger.debug( "step: %d, epsilon_expert: %s, epsilon_random: %s" % (self.time_step, self.epsilon_expert, self.epsilon_random)) # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # if done_batch[i]: # y_batch.append(reward_batch[i]) # else : # y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_cost = self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using 
the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() # def noise_action(self,state): # # Select action a_t according to the current policy and exploration noise # action = self.actor_network.action(state) # noise = self.exploration_noise.noise(action) # noise_action = action + noise # clipped_noise_action = np.clip(noise_action, 0, 1) # return clipped_noise_action # def noise_action(self,state): # # Select action a_t according to the current policy and exploration noise # action = self.actor_network.action(state) # noise = np.zeros(self.action_dim) # noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10) # noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10) # noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10) # noise_action = action + noise # logger.debug("action: %s, noise: %s" % (action, noise)) # clipped_noise_action = np.clip(noise_action, 0, 1) # return clipped_noise_action def action(self, state): action = self.actor_network.action(state) logger.debug("action: %s" % (action)) return action def opposite_action(self, state): logger.debug("state: %s" % (state)) action = self.actor_network.action(state) logger.debug("action: %s" % (action)) action[0] = 1 - action[0] logger.debug("opposite action: %s" % (action)) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # self.time_step = self.time_step + 1 # Store transitions to replay start size then start training if self.replay_buffer.count() >= REPLAY_START_SIZE: # logger.debug("train...") self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends # if done: # self.exploration_noise.reset() def saveNetwork(self): logger.warn("time step: %s, save model" % (self.time_step)) ckpt_file = os.path.join(MODEL_PATH, 'DDPG') self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
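# --- Illustration (not part of the original listing) ---
# epsilon_expert and epsilon_random above are annealed linearly from their start to
# end values over EXPLORE_COUNT training steps and then clamped. Equivalent closed
# form; the EXPLORE_COUNT value below is only for illustration.
EXPLORE_COUNT = 100000

def annealed(start, end, step, horizon=EXPLORE_COUNT):
    return max(end, start - (start - end) * step / horizon)

# e.g. annealed(1.0, 0.1, 50000) == 0.55 and annealed(1.0, 0.1, 200000) == 0.1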
class Worker: """docstring for DDPG""" def __init__(self, sess, number, model_path, global_episodes, explore, decay, training): self.name = 'worker_' + str(number) # name for uploading results self.number = number # Randomly initialize actor network and critic network # with both their target networks self.state_dim = 41 self.action_dim = 18 self.model_path = model_path self.global_episodes = global_episodes self.increment = self.global_episodes.assign_add(1) self.sess = sess self.explore = explore self.decay = decay self.training = training self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.name + '/actor') self.actor_network.update_target(self.sess) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.name + '/critic') self.critic_network.update_target(self.sess) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.update_local_ops_actor = update_target_graph( 'global/actor', self.name + '/actor') self.update_local_ops_critic = update_target_graph( 'global/critic', self.name + '/critic') def start(self, setting=0): self.env = RunEnv(visualize=True) self.setting = setting def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions( self.sess, next_state_batch) q_value_batch = self.critic_network.target_q(self.sess, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.sess, selfstate_batch) q_gradient_batch = self.critic_network.gradients( self.sess, state_batch, action_batch_for_gradients) self.actor_network.train(self.sess, q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target(self.sess) self.critic_network.update_target(self.sess) def save_model(self, saver, episode): #if self.episode % 10 == 1: if self.name == 'worker_0': saver.save(self.sess, self.model_path + "/model-" + str(episode) + ".ckpt") def noise_action(self, state, decay): # Select action a_t according to the current policy and exploration noise which gradually vanishes action = self.actor_network.action(self.sess, state) return action + self.exploration_noise.noise() * decay def action(self, state): action = self.actor_network.action(self.sess, state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start 
training if self.replay_buffer.count() > REPLAY_START_SIZE and self.training: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def work(self, coord, saver): if self.training: episode_count = self.sess.run(self.global_episodes) else: episode_count = 0 wining_episode_count = 0 total_steps = 0 print("Starting worker_" + str(self.number)) with self.sess.as_default(), self.sess.graph.as_default(): #not_start_training_yet = True while not coord.should_stop(): returns = [] rewards = [] episode_reward = 0 if np.random.rand( ) < 0.9: # change Aug20 stochastic apply noise noisy = True self.decay -= 1. / self.explore else: noisy = False self.sess.run(self.update_local_ops_actor) self.sess.run(self.update_local_ops_critic) state = self.env.reset(difficulty=self.setting) #print(observation) s = process_frame(state) print "episode:", episode_count # Train for step in xrange(self.env.spec.timestep_limit): state = process_frame(state) if noisy: action = np.clip( self.noise_action(state, np.maximum(self.decay, 0)), 0.0, 1.0 ) # change Aug20, decay noise (no noise after ep>=self.explore) else: action = self.action(state) next_state, reward, done, _ = self.env.step(action) #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done)) next_state = process_frame(next_state) self.perceive(state, action, reward * 100, next_state, done) state = next_state episode_reward += reward if done: break if episode % 5 == 0: print "episode reward:", reward_episode # Testing: #if episode % 1 == 0: if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1: # change Aug19 self.save_model(saver, episode_count) total_return = 0 ave_reward = 0 for i in xrange(TEST): state = self.env.reset() reward_per_step = 0 for j in xrange(self.env.spec.timestep_limit): action = self.action( process_frame(state)) # direct action for test state, reward, done, _ = self.env.step(action) total_return += reward if done: break reward_per_step += (reward - reward_per_step) / (j + 1) ave_reward += reward_per_step ave_return = total_return / TEST ave_reward = ave_reward / TEST returns.append(ave_return) rewards.append(ave_reward) print 'episode: ', episode, 'Evaluation Average Return:', ave_return, ' Evaluation Average Reward: ', ave_reward if self.name == 'worker_0' and self.training: sess.run(self.increment) episode_count += 1 # All done Stop trail # Confirm exit print('Done ' + self.name)
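# --- Hedged sketch (not part of the original listing) ---
# update_target_graph(from_scope, to_scope) used by Worker.__init__ is assumed to build
# the ops that copy the global network's parameters into this worker's local copy,
# in the usual A3C style:
import tensorflow as tf

def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]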
class DDPG: def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] # self.state_dim = env.observation_space.shape[0] * 2 self.action_dim = env.action_space.shape[0] self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) self.exploration_noise = OUNoise() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path)) else: my_config.logger.error("Could not find old network weights") def train(self): # my_config.logger.debug("......enter tain......") # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) noise = self.exploration_noise.noise(action) # if random.random() <= 0.5: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5]) # else: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75]) noise_action = action + noise clipped_noise_action = np.clip(noise_action, 0, 1) # if (self.time_step < 5): # my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action)) return clipped_noise_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, 
reward, next_state, done) self.time_step = self.time_step + 1 # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends # if done: # self.exploration_noise.reset() def saveNetwork(self): # my_config.logger.warn("time step: %s, save model" % (self.time_step)) ckpt_file = os.path.join(MODEL_PATH, 'ltr') self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
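# Rough usage sketch for the agent above. The environment, episode counts and checkpoint
# cadence are illustrative assumptions; the only requirements perceive()/noise_action()
# place on the environment are the usual step(action) -> (state, reward, done, info) API
# and an action range compatible with the [0, 1] clipping in noise_action.
# env = make_env()                       # hypothetical continuous-control environment
# agent = DDPG(env)
# for episode in range(1000):
#     state = env.reset()
#     done = False
#     while not done:
#         action = agent.noise_action(state)            # clipped, OU-noised action
#         next_state, reward, done, _ = env.step(action)
#         agent.perceive(state, action, reward, next_state, done)
#         state = next_state
#     if episode % 100 == 0:
#         agent.saveNetwork()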
class DDPG: """docstring for DDPG""" def __init__(self): self.name = 'DDPG' # name for uploading results # self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = 12 self.action_dim = 10 self.has_kicked = False self.laststep_haskicked = False self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) self.saver = tf.train.Saver(max_to_keep=1) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # print(minibatch) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) # print(q_value_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) with open('/home/ruizhao/Desktop/a.txt', 'a') as f: print("action_batch[0]", file=f) print(action_batch[0], file=f) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) with open('/home/ruizhao/Desktop/a.txt', 'a') as f: print("q_gradient_batch[0]", file=f) print(q_gradient_batch[0], file=f) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action2(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def noise_action(self, state): action = self.actor_network.action(state) random_action = np.zeros(10, float) random_action[random.randint(0, 3)] = 1 random_action[4] = random.uniform(-100, 100) #DASH POWER random_action[5] = random.uniform(-180, 180) #DASH DEGREES random_action[6] = random.uniform(-180, 180) #TURN DEGREES random_action[7] = random.uniform(-180, 180) #TACKLE DEGREES random_action[8] = random.uniform(0, 100) #KICK POWER random_action[9] = random.uniform(-180, 180) #KICK DEGREES if np.random.uniform() < EPSILON: return action else: return random_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if 
self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset()
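# The DDPG variants in this file all import an OUNoise class that is not shown here.
# For reference, a minimal Ornstein-Uhlenbeck noise process with the
# (action_dim, mu, theta, sigma) constructor pattern used above looks roughly like this;
# the default parameter values are assumptions.
import numpy as np

class OUNoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Restart the process at its mean, typically when an episode ends.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # One Euler step: dx = theta * (mu - x) + sigma * N(0, I). The samples are
        # temporally correlated, which tends to explore continuous-control tasks
        # better than independent white noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state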
class DDPG: def __init__(self, env, state_dim, action_dim): self.name = 'DDPG' self.environment = env self.time_step = 0 self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6) self.angular_noise = OUNoise(1, 0, 0.6, 0.8) def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state, epsilon): action = self.actor_network.action(state) noise_t = np.zeros(self.action_dim) noise_t[0] = epsilon * self.linear_noise.noise() noise_t[1] = epsilon * self.angular_noise.noise() action = action + noise_t a_linear = np.clip(action[0], 0, 1) a_linear = round(a_linear, 1) a_angular = np.clip(action[1], -1, 1) a_angular = round(a_angular, 1) #print(a_linear, a_angular) return [a_linear, a_angular] def action(self, state): action = self.actor_network.action(state) a_linear = np.clip(action[0], 0, 1) a_linear = round(a_linear, 1) a_angular = np.clip(action[1], -1, 1) a_angular = round(a_angular, 1) return [a_linear, a_angular] def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() == REPLAY_START_SIZE: print('\n---------------Start training---------------') # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % 10000 == 0 and self.time_step > 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) if done: self.linear_noise.reset() self.angular_noise.reset() return self.time_step
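# The navigation agent above expects the caller to anneal epsilon and pass it to
# noise_action. A rough sketch of that outer loop (the environment, episode count and
# decay schedule are illustrative assumptions):
# epsilon = 1.0
# for episode in range(2000):
#     state = env.reset()
#     done = False
#     while not done:
#         action = agent.noise_action(state, epsilon)    # [linear, angular] command
#         next_state, reward, done, _ = env.step(action)
#         step_count = agent.perceive(state, action, reward, next_state, done)
#         state = next_state
#     epsilon = max(0.05, epsilon - 1.0 / 2000)          # linear decay toward a floor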
class ddpg: def __init__(self, env_name, sess, state_dim, action_dim, models_dir, img_dim): self.name = 'DDPG' self.env_name = env_name self.state_dim = state_dim self.action_dim = action_dim self.img_dim = img_dim self.models_dir = models_dir # Ensure action bound is symmetric self.time_step = 0 self.sess = sess self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.img_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.img_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.saver = tf.train.Saver() def train(self): minibatch = self.replay_buffer.getBatch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) img_batch = np.asarray([data[5] for data in minibatch]) next_img_batch = np.asarray([data[6] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions( next_state_batch, next_img_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch, next_img_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) critic_cost = self.critic_network.train(y_batch, state_batch, action_batch, img_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( state_batch, img_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients, img_batch) self.actor_network.train(q_gradient_batch, state_batch, img_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() return critic_cost def save_network(self, step): self.saver.save(self.sess, self.models_dir + self.env_name + '-network-ddpg.ckpt', global_step=step) def load_network(self): checkpoint = tf.train.get_checkpoint_state(self.models_dir) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old network weights") ''' def action(self,state): action = self.actor_network.action(state) action[0][0] = np.clip( action[0][0], -1 , 1 ) action[0][1] = np.clip( action[0][1], 0 , 1 ) action[0][2] = np.clip( action[0][2], 0 , 1 ) #print "Action:", action return action[0] def noise_action(self,state,epsilon): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) print action.shape print "Action_No_Noise:", action noise_t = np.zeros([1,self.action_dim]) noise_t[0][0] = epsilon * self.OU.function(action[0][0], 0.0 , 0.60, 0.80) noise_t[0][1] = epsilon * self.OU.function(action[0][1], 0.5 , 1.00, 0.10) noise_t[0][2] = epsilon * self.OU.function(action[0][2], -0.1 , 1.00, 0.05) action = action+noise_t action[0][0] = np.clip( action[0][0], -1 , 1 ) action[0][1] = np.clip( action[0][1], 0 , 1 ) action[0][2] = np.clip( action[0][2], 0 , 1 ) print "Action_Noise:", action return action[0] ''' def action(self, state, img): action = 
self.actor_network.action(state, img) action[0] = np.clip(action[0], -1, 1) # action[1] = np.clip( action[1], 0 , 1 ) # action[2] = np.clip( action[2], 0 , 1 ) return action def noise_action(self, state, epsilon, img): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state, img) noise_t = np.zeros(self.action_dim) if self.time_step < 100000: noise_t[0] = epsilon * ornstein_uhlenbeck_process( action[0], 0.0, 0.60, 0.80) # noise_t[1] = epsilon * ornstein_uhlenbeck_process(action[1], 0.5 , 1.00, 0.10) # noise_t[2] = epsilon * ornstein_uhlenbeck_process(action[2], -0.1 , 1.00, 0.05) elif self.time_step < 200000: if np.random.random() < 0.1: noise_t[0] = 0.1 * ornstein_uhlenbeck_process( action[0], 0.0, 0.60, 0.80) action = action + noise_t action[0] = np.clip(action[0], -1, 1) # action[1] = np.clip( action[1], 0 , 1) # action[2] = np.clip( action[2], 0 , 1) return action def perceive(self, state, action, reward, next_state, done, img, next_img): if not (math.isnan(reward)): self.replay_buffer.add(state, action, reward, next_state, done, img, next_img) self.time_step = self.time_step + 1 # Return critic cost if self.replay_buffer.count() > REPLAY_START_SIZE: return self.train() else: return 0
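# noise_action above calls an ornstein_uhlenbeck_process(x, mu, theta, sigma) helper
# defined elsewhere. Judging from how it is used (per-component, centred on the current
# action), it is presumably the usual one-step OU form; a sketch under that assumption:
import numpy as np

def ornstein_uhlenbeck_process_sketch(x, mu, theta, sigma):
    # Pull the value x back toward mu at rate theta and add Gaussian jitter of scale sigma.
    return theta * (mu - x) + sigma * np.random.randn()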
class DDPG_TF: """docstring for DDPG_TF""" def __init__(self, env,loadfilename=None,printVars=False): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) #print 'init complete' self.all_vars = tf.global_variables() if printVars: for v in self.all_vars: print v.name.ljust(30), v.shape self.saver = tf.train.Saver(self.all_vars) if loadfilename is not None: self.saver.restore(self.sess, loadfilename) #print 'restore complete' def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def actions(self, states): actions = self.actor_network.actions_no_training(states) return actions def target_actions(self, states): actions = self.actor_network.target_actions(states) return actions def value(self, states): actions = self.actor_network.actions_no_training(states) values = self.critic_network.q_value(states,actions) return values def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) self.critic_network = CriticNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self, observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer, BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch, [BATCH_SIZE, 1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate( next_state_batch) q_value_batch = self.critic_network.target_evaluate( next_state_batch, next_action_batch) for i in range(0, BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) / BATCH_SIZE self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action + self.exploration_noise.noise(), self.environment.action_space.low, self.environment.action_space.high) def set_feedback(self, observation, action, reward, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append( (self.state, action, reward, next_state, done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
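# This variant drives the interaction loop through set_init_observation / get_action /
# set_feedback instead of a perceive(state, action, ...) call. A rough sketch of the
# intended calling pattern (gym environment and episode length are assumptions; the
# action batch is resized to [BATCH_SIZE, 1], so a 1-D action space is expected):
# import gym
# env = gym.make('Pendulum-v0')
# agent = DDPG(env)
# for episode in range(1000):
#     agent.set_init_observation(env.reset())
#     for step in range(200):
#         action = agent.get_action()
#         observation, reward, done, _ = env.step(action)
#         agent.set_feedback(observation, action, reward, done)
#         if done:
#             break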
class APLDDPGAgent(AbstractAgent): name = "apl_ddpg" def __init__(self, env, iter=200000, *args, **kwargs): # create the actor model # create the critic model self.env = env self.action_dim = sum( sum(1 for i in row if i) for row in self.env.action_space.sample()) self.observation = env.reset() self.state_dim = self.observation.shape print ">>>>>>>>>>>>>>>>>>>>>state dim " + str(self.state_dim) self.nn_action_dim = 6 # limit ddpg network output to 3 DOF self.noise = OUProcess(self.nn_action_dim, mu=OU_MEAN, theta=OU_THETA, sigma=EPSILON_RANGE[0]) def fit(self, *args, **kwargs): MEM_SZ = MEM_SIZE_FCL sess = K.get_session() K.set_learning_phase(1) self.actor = ActorNetwork(sess, self.state_dim, self.nn_action_dim, BATCH_SIZE, TAU, LRA, convolutional=CONVOLUTIONAL, output_activation=ACTION_ACTIVATION) self.critic = CriticNetwork(sess, self.state_dim, self.nn_action_dim, BATCH_SIZE, TAU, LRC, convolutional=CONVOLUTIONAL) self.memory = Memory(MEM_SZ) self.actor.target_model.summary() self.critic.target_model.summary() if LOAD_WEIGHTS: self.actor.model.load_weights(LOAD_WEIGHTS_PREFIX + "actor_model_" + LOAD_WEIGHTS_EPISODE + ".h5") self.critic.model.load_weights(LOAD_WEIGHTS_PREFIX + "critic_model_" + LOAD_WEIGHTS_EPISODE + ".h5") self.actor.target_model.load_weights(LOAD_WEIGHTS_PREFIX + "actor_target_model_" + LOAD_WEIGHTS_EPISODE + ".h5") self.critic.target_model.load_weights(LOAD_WEIGHTS_PREFIX + "critic_target_model_" + LOAD_WEIGHTS_EPISODE + ".h5") print("Weights Loaded!") #==================================================== #Initialize noise processes #self.noise_procs = [] #for i in range(NUM_NOISE_PROCS): # self.noise_procs.append(OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV)) #==================================================== PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS steps = STARTING_EPISODE * EPISODE_LENGTH start_time = time.time() last_ep_time = time.time() if MAKE_PLOT: reward_graph = Grapher() for ep in range(STARTING_EPISODE, EPISODES): #reset noise processes #for ou in self.noise_procs: # ou.reset() self.noise.reset() #start time counter if (ep == PRE_LEARNING_EPISODES): start_time = time.time() print("Episode: " + str(ep) + " Frames: " + str(ep * EPISODE_LENGTH) + " Uptime: " + str( (time.time() - start_time) / 3600.0) + " hrs ===========") state = self.env.reset() play_only = (ep % 10 == 0) total_reward = 0 if play_only or ALREADY_TRAINED: for step in range(TEST_EPISODE_LENGTH): #print ">>>>>>>>>>>>>", state.shape #img = np.array([np.subtract(img, 128)], dtype=np.float32) #zero center #img = np.multiply(img, 1.0/128.0) #scale [-1,1] #img = np.transpose(state, (1,2,0)) #img = np.array(state) #img = np.transpose(img, (1,2,0)) #print ">>>>>>>>>>>>>", state.shape state = np.reshape(state, state.shape + (1, )) action, control_action = self.selectAction( state, can_be_random=False, use_target=True) nstate, reward, done, info = self.env.step(control_action) total_reward += reward state = nstate else: for step in range(EPISODE_LENGTH): # ACT ============================== epsilon = (float(steps) / float(EPSILON_STEPS)) * ( EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0] state = np.reshape(state, state.shape + (1, )) action, control_action = self.selectAction(state, epsilon=epsilon) new_state, reward, done, info = self.env.step( control_action) done = done or (step >= EPISODE_LENGTH) self.memory.addMemory(state, action, reward, new_state, done) state = new_state # LEARN ============================ if ep > PRE_LEARNING_EPISODES: batch, idxs = 
self.memory.getMiniBatch(BATCH_SIZE) self.learnFromBatch(batch) if done: break # CLEANUP ========================== steps += 1 #we need to consider the episodes without noise to actually tell how the system is doing if play_only and MAKE_PLOT: reward_graph.addSample(total_reward) reward_graph.displayPlot() #calculate fph on total frames total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH elapsed = time.time() - start_time fps = total_frames / elapsed fph = fps * 3600.0 #re-calculate fps on this episode, so it updates quickly fps = EPISODE_LENGTH / (time.time() - last_ep_time) last_ep_time = time.time() print("fps: " + str(fps) + " fph: " + str(fph) + "\n") #save plot and weights if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY == 0) and not ALREADY_TRAINED: #plot if MAKE_PLOT: reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" + str(ep) + ".jpg") #weights self.actor.model.save_weights(SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) + ".h5", overwrite=True) self.actor.target_model.save_weights( SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) + ".h5", overwrite=True) self.critic.model.save_weights( SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".h5", overwrite=True) self.critic.target_model.save_weights( SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) + ".h5", overwrite=True) #network structures (although I don't think I ever actually use these) with open( SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) + ".json", "w") as outfile: json.dump(self.actor.model.to_json(), outfile) with open( SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) + ".json", "w") as outfile: json.dump(self.actor.target_model.to_json(), outfile) with open( SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".json", "w") as outfile: json.dump(self.critic.model.to_json(), outfile) with open( SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) + ".json", "w") as outfile: json.dump(self.critic.target_model.to_json(), outfile) def learnFromBatch(self, miniBatch): dones = np.asarray([sample['isFinal'] for sample in miniBatch]) states = np.asarray([sample['state'] for sample in miniBatch]) actions = np.asarray([sample['action'] for sample in miniBatch]) new_states = np.asarray([sample['newState'] for sample in miniBatch]) Y_batch = np.asarray([sample['reward'] for sample in miniBatch]) new_states = np.reshape(new_states, new_states.shape + (1, )) target_q_values = self.critic.target_model.predict( [new_states, self.actor.target_model.predict(new_states)]) for i in range(len(miniBatch)): if not dones[i]: Y_batch[i] = Y_batch[i] + GAMMA * target_q_values[i] self.critic.model.train_on_batch([states, actions], Y_batch) #additional operations to train actor temp_actions = self.actor.model.predict(states) grads = self.critic.gradients(states, temp_actions) self.actor.train(states, grads) #update target networks self.actor.target_train() self.critic.target_train() ''' This is wrong I think def OU(x, mu, theta, sigma): return theta * (mu - x) + sigma * np.random.randn(1) ''' def clip(self, x, minx, maxx): return max(minx, min(maxx, x)) def selectAction(self, state, can_be_random=True, use_target=False, epsilon=1.0, permutation_num=0): state = np.array([state]) #add dimension to make a "batch" of 1 if use_target: actions = self.actor.target_model.predict(state) else: actions = self.actor.model.predict(state) actions = np.squeeze(actions) #print control_actions #print("+++++++++++") #print(actions) if can_be_random: self.noise.sigma = epsilon noise = self.noise.noise() #print noise i = 0 for idx, a in 
enumerate(actions): actions[i] = actions[i] + noise[i] actions[i] = self.clip( actions[i], -3.14, 3.14) #need to assign to actions[i], not just a. i += 1 #get noise #noise = [] #iterate over all noise procs for non-coop, or a single agent's procs for co-op #for n in range(permutation_num*ACTIONS_PER_AGENT, permutation_num*ACTIONS_PER_AGENT + self.action_dim): # ou = self.noise_procs[n] # noise.append(ou.step()) # for idx, a in enumerate(actions): # ou = self.noise_procs[0] # noise = ou.step() # a = a + epsilon*noise # #print epsilon * noise # actions[i] = self.clip(a, -3.14, 3.14) #need to assign to actions[i], not just a. # i += 1 # #print(actions) #fill in zeros for all non-learned outputs control_actions = np.pad(actions, (0, self.action_dim - len(actions)), 'constant') #print actions #print control_actions return actions, control_actions #Constructs an image from state vector def constructImageRepresentation(self, state): img = np.empty([IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH], dtype=np.uint8) img.fill(128) color = 255 delta_color = int(math.floor(128 / NUM_TARGETS)) for j in range(NUM_TARGETS): tar = [state[2 * j], state[2 * j + 1]] cv2.circle(img, (int( tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)), 5, 0, -1) cv2.circle(img, (int( tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)), 4, color, -1) color -= delta_color color = 0 for j in range(NUM_AGENTS): offset = 2 * NUM_TARGETS agent = [state[offset + 2 * j], state[offset + 2 * j + 1]] #draw blank agent, no thrust display cv2.rectangle(img, (int(agent[0] * IMAGE_SIDE_LENGTH) - 4, int(agent[1] * IMAGE_SIDE_LENGTH) - 1), (int(agent[0] * IMAGE_SIDE_LENGTH) + 4, int(agent[1] * IMAGE_SIDE_LENGTH) + 1), color, -1) cv2.rectangle(img, (int(agent[0] * IMAGE_SIDE_LENGTH) - 1, int(agent[1] * IMAGE_SIDE_LENGTH) - 4), (int(agent[0] * IMAGE_SIDE_LENGTH) + 1, int(agent[1] * IMAGE_SIDE_LENGTH) + 4), color, -1) #first agent ia 0 since we control it, others are same color color = 64 ''' cv2.namedWindow('perm_image',cv2.WINDOW_NORMAL) cv2.resizeWindow('perm_image', 600,600) cv2.imshow('perm_image', img) cv2.waitKey(1) ''' img = np.array([np.subtract(img, 128)], dtype=np.float32) #zero center img = np.multiply(img, 1.0 / 128.0) #scale [-1,1] img = np.transpose(img, (1, 2, 0)) return img #for co-op case, get an arrangement of the state vector for each agent. def getStatePermutations(self, state): perms = [] for i in range(NUM_AGENTS): if CONVOLUTIONAL and not DRAW_STATE: perms.append(state) else: pstate = [] #copy over target data for j in range(NUM_TARGETS * 2): pstate.append(state[j]) #copy agent data, rotated for j in range(NUM_AGENTS * 2): rot_j = (j + (i * 2)) % (NUM_AGENTS * 2) + (NUM_TARGETS * 2) pstate.append(state[rot_j]) if DRAW_STATE: perms.append(constructImageRepresentation(pstate)) else: perms.append(np.asarray(pstate, dtype=np.float32)) return perms
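# Worked example of the exploration schedule used in fit() above: sigma is linearly
# interpolated between EPSILON_RANGE[0] and EPSILON_RANGE[1] over EPSILON_STEPS frames.
# With the assumed values EPSILON_RANGE = (1.0, 0.1) and EPSILON_STEPS = 100000:
#   steps = 0       ->  epsilon = 1.00
#   steps = 50000   ->  epsilon = 0.55
#   steps = 100000  ->  epsilon = 0.10
# epsilon is then written into self.noise.sigma inside selectAction, so the OU process
# gradually becomes less aggressive as training progresses.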
class RDPG: """docstring for RDPG""" def __init__(self, env): self.name = 'RDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.saver = tf.train.Saver() def train(self): # Sample a random minibatch of N sequences from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # Construct histories observations = [] next_observations = [] actions = [] rewards = [] dones = [] for each in minibatch: for i in range(1, len(each.observations)): observations.append(self.pad(each.observations[0:i])) next_observations.append(self.pad(each.observations[1:i + 1])) actions.append(each.actions[0:i - 1]) rewards.append(each.rewards[0:i]) if i == len(each.observations) - 1: dones.append(True) else: dones.append(False) # Calculate y_batch next_action_batch = self.actor_network.target_action(observations) q_value_batch = self.critic_network.target_q( next_observations, [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)]) y_batch = [] for i in range(len(observations)): if dones[i]: y_batch.append(rewards[i][-1]) else: y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [len(observations), 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, observations, [self.pad(i) for i in actions]) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(observations) q_gradient_batch = self.critic_network.gradients( observations, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, observations) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def save_model(self, path, episode): self.saver.save(self.sess, path + "model.ckpt", episode) def noise_action(self, history): # Select action a_t according to a sequence of observation and action action = self.actor_network.action(history) return action + self.exploration_noise.noise() def action(self, history): action = self.actor_network.action(history) return action def perceive(self, history): # Store the history sequence in the replay buffer self.replay_buffer.add(history) # Store history to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # Re-initialize the random process when an episode ends (each stored history is one complete episode) self.exploration_noise.reset() def pad(self, input): dim = len(input[0]) return input + [[0] * dim] * (1000 - len(input))
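# RDPG.perceive stores one complete episode history per call, and train() only assumes
# the stored object exposes .observations, .actions and .rewards lists. A minimal
# container of that shape (an assumption about the project's actual history type, which
# is not shown here):
from collections import namedtuple

History = namedtuple('History', ['observations', 'actions', 'rewards'])

# e.g. after rolling out one episode:
# history = History(observations=obs_list, actions=act_list, rewards=rew_list)
# agent.perceive(history)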
state_batch = np.random.rand(batch_size, state_dim) # with tf.Session() as sess: # actor = ActorNetwork(sess,state_dim,action_dim,agent_name,1) # print(actor.actions(state_batch)) # actor.update_target() # print('\n') # print(actor.target_actions(state_batch)) # # actor.train(y_grad,state_batch) # actor.update_target() # print(actor.target_actions(state_batch)) # test create multiple agents # agents = [] # with tf.Session() as sess: # for ii in range(10): # agent_name = 'agent'+str(ii) # print(agent_name) # agents.append(ActorNetwork(sess, state_dim, action_dim, agent_name)) # # print(agents) # test the copy works with tf.Session() as sess: agent1 = ActorNetwork(sess,state_dim,action_dim,'agent1') agent1.train(y_grad,state_batch) agent2 = ActorNetwork(sess, state_dim, action_dim, 'agent2', agent1.nets) print('agent 1', agent1.actions(state_batch)) print('agent 2', agent2.actions(state_batch))
class DDPGAgent(): """ Deep deterministic policy gradient agent as described in https://arxiv.org/abs/1509.02971. This agent is meant to operate on low dimensional inputs, not raw pixels. To use the agent, you can get action predictions using act(), and to teach the agent, feed the results to learn. """ def __init__(self, state_size, action_size, num_agents): """ Initialize agent. Params ====== state_size (integer): Size of input state vector action_size (integer): Size of action vector num_agents (integer): Number of simultaneous agents in the environment """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents # Actor self.local_actor_network = ActorNetwork(state_size, action_size) self.target_actor_network = ActorNetwork(state_size, action_size) self.actor_optimizer = optim.Adam( self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE) # Critic self.local_critic_network = CriticNetwork(state_size, action_size) self.target_critic_network = CriticNetwork(state_size, action_size) self.critic_optimizer = optim.Adam( self.local_critic_network.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=CRITIC_WEIGHT_DECAY) self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None) self.steps = 0 self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.random_process = OrnsteinUhlenbeckProcess( (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA) def act(self, states, noise=True): """ Returns an action vector based on the current game state. Params ====== states (array_like): A matrix of game states (each row represents the state of an agent) noise (boolean): Add random noise to the predicted action. Aids exploration of the environment during training. """ self.local_actor_network.eval() with torch.no_grad(): actions = self.local_actor_network( torch.tensor(states, dtype=torch.float32)).detach().numpy() self.local_actor_network.train() if noise: actions = actions + self.random_process.sample() actions = np.clip(actions, -1, 1) return actions def vectorize_experiences(self, experiences): """Vectorizes experience objects for use by pytorch Params ====== experiences (array_like of Experience objects): Experiences to vectorize """ states = torch.from_numpy( np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) actions = torch.from_numpy( np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) rewards = torch.from_numpy( np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) next_states = torch.from_numpy( np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) dones = torch.from_numpy( np.vstack([e.done for e in experiences if e is not None ]).astype(np.uint8)).float().to(self.device) return (states, actions, rewards, next_states, dones) def normalize(self, to_normalize): """ Normalize the each row of the input along the 0 dimension using the formula (value - mean)/std Params ====== to_normalize (array_like): Values to normalize """ std = to_normalize.std(0) mean = to_normalize.mean(0) return (to_normalize - mean) / (std + 1e-5) def soft_update(self, target_parameters, local_parameters): """ Updates the given target network parameters with the local parameters using a soft update strategy: tau * local + (1-tau) * target """ for target, local in zip(target_parameters, local_parameters): target.data.copy_(TAU * local.data + (1.0 - TAU) * target.data) def train(self, experiences): """ Trains 
the actor and critic networks using a minibatch of experiences Params ====== experiences (array_like of Experience): Minibatch of experiences """ states, actions, rewards, next_states, dones = self.vectorize_experiences( experiences) #states = self.normalize(states) #next_states = self.normalize(next_states) rewards = self.normalize(rewards) # Use the target critic network to calculate a target q value next_actions = self.target_actor_network(next_states) q_target = rewards + GAMMA * self.target_critic_network( next_states, next_actions) * (1 - dones) # Calculate the predicted q value q_predicted = self.local_critic_network(states, actions) # Update critic network critic_loss = F.mse_loss(q_predicted, q_target) #print(critic_loss) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.local_critic_network.parameters(), 1) self.critic_optimizer.step() # Update predicted action using policy gradient actions_predicted = self.local_actor_network(states) #print(self.local_critic_network(states, actions_predicted).mean()) policy_loss = -self.local_critic_network(states, actions_predicted).mean() self.actor_optimizer.zero_grad() policy_loss.backward() #print(policy_loss) self.actor_optimizer.step() self.soft_update(self.target_actor_network.parameters(), self.local_actor_network.parameters()) self.soft_update(self.target_critic_network.parameters(), self.local_critic_network.parameters()) def learn(self, experience): """ Tells the agent to learn from an experience. This may not immediately result in training since this agent uses a replay buffer. Params ====== experience (Experience): An experience used to teach the agent. """ self.replay_buffer.add(experience) self.steps += 1 if self.steps % STEPS_BETWEEN_TRAINING == 0 and len( self.replay_buffer) >= BATCH_SIZE: for i in range(ITERATIONS_PER_TRAINING): self.train(self.replay_buffer.sample(BATCH_SIZE)) def save(self, filename): """Saves learned params of underlying networks to a checkpoint file. Params ====== filename (string): Target file. agent- and critic- are prepended for the agent and critic network, respectively """ torch.save(self.local_actor_network.state_dict(), "actor-" + filename) torch.save(self.local_critic_network.state_dict(), "critic-" + filename) def load(self, filename): """Loads learned params generated by save() into underlying networks. filename (string): Path to file. There should be an agent- and critic- version of this file. """ self.local_actor_network.load_state_dict( torch.load("actor-" + filename)) self.target_actor_network.load_state_dict( torch.load("actor-" + filename)) self.local_critic_network.load_state_dict( torch.load("critic-" + filename)) self.target_critic_network.load_state_dict( torch.load("critic-" + filename)) def end_episode(self): """ Tell the agent that an episode is complete. """ self.random_process.reset() self.steps = 0
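# The PyTorch agent above samples exploration noise from an OrnsteinUhlenbeckProcess
# constructed with a (num_agents, action_size) shape and exposing sample() / reset();
# that class is not shown here. A minimal numpy sketch with the same interface (the
# zero mean and the dt handling are assumptions):
import numpy as np

class OrnsteinUhlenbeckProcessSketch(object):
    def __init__(self, shape, sigma=0.2, theta=0.15, mu=0.0, dt=1.0):
        self.shape = shape
        self.sigma = sigma
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.reset()

    def reset(self):
        # Restart the process at its mean, typically at the end of an episode.
        self.x = np.ones(self.shape) * self.mu

    def sample(self):
        # One Euler-Maruyama step of dx = theta * (mu - x) dt + sigma * sqrt(dt) * dW.
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(self.shape))
        self.x = self.x + dx
        return self.x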
class DDPG: def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau, batch_size, n_train, n_episode): # Gym environment self.env = env env_flattened = gym.wrappers.FlattenDictWrapper( env, dict_keys=['observation', 'achieved_goal', 'desired_goal']) # Get space sizes self.state_dim = env_flattened.observation_space.shape[0] #self.state_dim = self.env.observation_space.shape[0] self.action_dim = self.env.action_space.shape[0] # Get replay buffer and function get a batch from it self.replay_buffer = replay_buffer self.sample_batch = sample_batch self.sess = tf.InteractiveSession() # Hyper parameters self.gamma = gamma self.tau = tau self.batch_size = batch_size self.n_train = n_train self.n_episode = n_episode # Initialize networks self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim) self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.exploration_noise = OUNoise(self.action_dim) def train(self): batch = self.sample_batch(self.batch_size) state_batch = np.asarray([data[0] for data in batch]) action_batch = np.asarray([data[1] for data in batch]) reward_batch = np.asarray([data[2] for data in batch]) next_state_batch = np.asarray([data[3] for data in batch]) done_batch = np.asarray([data[4] for data in batch]) next_action_batch = self.actor.target_actions(next_state_batch) q_value_batch = self.critic.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(batch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + self.gamma * q_value_batch[i]) y_batch = np.resize(y_batch, [self.batch_size, 1]) # Update critic by minimizing the loss L self.critic.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor.actions(state_batch) q_gradient_batch = self.critic.gradients(state_batch, action_batch_for_gradients) self.actor.train(q_gradient_batch, state_batch) # Update the target networks self.actor.update_target() self.critic.update_target() def noise_action(self, state): action = self.actor.action(state) return action + self.exploration_noise.noise() def action(self, state): return self.actor.action(state) def reset_noise(self): self.exploration_noise.reset() def save_policy(self, save_path): self.actor.save_network(save_path)
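# The constructor above takes the replay buffer and a sample_batch callable from the
# caller, so the same agent can be wired to a plain uniform buffer or to a hindsight
# (HER-style) sampler. train() only assumes sample_batch(batch_size) yields
# (state, action, reward, next_state, done) tuples. A minimal wiring sketch (buffer
# class, environment and hyperparameter values are assumptions):
# replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
# agent = DDPG(env, replay_buffer, sample_batch=replay_buffer.get_batch,
#              train_iter=40, gamma=0.98, tau=0.05, batch_size=128,
#              n_train=50, n_episode=2000)
# for _ in range(agent.n_train):
#     agent.train()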