def __init__(self, env):
    # Hyperparameters
    self.GAMMA = 0.95
    self.BATCH_SIZE = 64
    self.BUFFER_SIZE = 20000
    self.ACTOR_LEARNING_RATE = 0.0001
    self.CRITIC_LEARNING_RATE = 0.001
    self.TAU = 0.001

    self.env = env
    # get state dimension
    self.state_dim = env.observation_space.shape[0]
    # get action dimension
    self.action_dim = env.action_space.shape[0]
    # get action bound
    self.action_bound = env.action_space.high[0]

    ## create actor and critic networks
    self.actor = Actor(self.state_dim, self.action_dim,
                       self.action_bound, self.TAU, self.ACTOR_LEARNING_RATE)
    self.critic = Critic(self.state_dim, self.action_dim,
                         self.TAU, self.CRITIC_LEARNING_RATE)

    ## initialize replay buffer
    self.buffer = ReplayBuffer(self.BUFFER_SIZE)

    # save the results
    self.save_epi_reward = []
def __init__(self, env):
    self.sess = tf.Session()
    K.set_session(self.sess)

    ## hyperparameters
    self.GAMMA = 0.95
    self.BATCH_SIZE = 128
    self.BUFFER_SIZE = 20000
    self.MIN_SAMPLES_TO_BEGIN_LEARNING = 1000
    self.ACTOR_LEARNING_RATE = 0.001
    self.CRITIC_LEARNING_RATE = 0.001
    self.TAU = 0.001

    self.env = env
    # get state dimension
    self.state_dim = env.observation_space.shape[0]
    # get action dimension
    self.action_dim = env.action_space.shape[0]
    # get action bound
    self.action_bound = env.action_space.high[0]

    ## create actor and critic networks
    self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                       self.action_bound, self.TAU, self.ACTOR_LEARNING_RATE)
    self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                         self.TAU, self.CRITIC_LEARNING_RATE)

    ## initialize for later gradient calculation
    self.sess.run(tf.global_variables_initializer())  # <-- no problem without it

    ## initialize replay buffer
    self.buffer = ReplayBuffer(self.BUFFER_SIZE)

    # save the results
    self.save_epi_reward = []
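# --- Illustrative helper (not part of the snippets above) ---
# The constructors above assume a ReplayBuffer(buffer_size) class whose code is
# not shown here. The following is a minimal sketch, assuming the interface used
# later in this file (add_buffer, sample_batch, and a buffer_size attribute read
# as the current fill level); names and behavior are assumptions, not the
# original implementation.
from collections import deque
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
        self.buffer_size = 0  # current number of stored transitions

    def add_buffer(self, state, action, reward, next_state, done):
        # store one transition tuple
        self.buffer.append((state, action, reward, next_state, done))
        self.buffer_size = len(self.buffer)

    def sample_batch(self, batch_size):
        # uniformly sample a mini-batch of transitions
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states = np.asarray([b[0] for b in batch])
        actions = np.asarray([b[1] for b in batch])
        rewards = np.asarray([b[2] for b in batch])
        next_states = np.asarray([b[3] for b in batch])
        dones = np.asarray([b[4] for b in batch])
        return states, actions, rewards, next_states, dones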
class DDPGAgent:
    def __init__(self, env, track, episodes=650):
        self.env = env
        self.track = track
        self.max_episodes = episodes
        self.max_steps = 3000
        self.save_model = True
        self.load_model = False
        self.restart_memory_leak = 25

        ### size of action- and state space
        self.state_size = 70
        self.action_size = 3

        ### DDPG Hyperparameters
        self.epsilon = 1.0
        self.epsilon_decay = 1 / 96000
        self.epsilon_min = 0.07
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.00011
        self.lr_critic = 0.0011

        ### set OU Process
        self.ou = OU()

        ### tf gpu and session setup
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

        ### actor, critic and replay memory
        self.actor = Actor(self.sess, self.state_size, self.action_size,
                           self.tau, self.lr_actor)
        self.critic = Critic(self.sess, self.state_size, self.action_size,
                             self.tau, self.lr_critic)
        self.memory = ExperienceReplayBuffer(50000)

        ### helper class to build state representation
        self.dataset_builder = DatasetBuilder()

    def saveModel(self):
        self.actor.model.save("./ddpg_weights/ddpg_actor_model.h5")
        self.critic.model.save("./ddpg_weights/ddpg_critic_model.h5")

    def lowerExploration(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def trainAgent(self):
        all_total_rewards = []
        all_dist_raced = []
        all_dist_percentage = []
        all_avg_speed = []
        all_car_hits = []
        all_race_pos = []

        for e in range(self.max_episodes):
            ### save weights every 10th episode
            if self.save_model:
                if (e % 10) == 0:
                    self.saveModel()

            ### relaunch TORCS every `restart_memory_leak` episodes because its
            ### memory leak would otherwise slow the thread down
            if (e % self.restart_memory_leak) == 0:
                state = self.env.reset(relaunch=True)
            else:
                state = self.env.reset()

            ### build state representation
            state, _ = self.dataset_builder.buildStateDataSet(s=state)

            total_reward = 0
            avg_speed = 0
            avg_racepos = 0
            damage = 0
            damage_hit_counter = 0

            for j in range(self.max_steps):
                ### initialize numpy matrices to hold action values with OU noise
                action_with_noise = np.zeros([1, self.action_size])
                noise = np.zeros([1, self.action_size])

                ### get action values from actor
                action = self.actor.model.predict(state.reshape(1, state.shape[0]))

                ###################################################################
                ### OU parameters derived from                                  ###
                ### https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html     ###
                ### and our own experiments                                     ###
                ###################################################################
                noise[0][0] = self.epsilon * self.ou.calc_noise(action[0][0], 0.0, 0.55, 0.15)
                noise[0][1] = self.epsilon * self.ou.calc_noise(action[0][1], 0.55, 1.00, 0.10)
                noise[0][2] = self.epsilon * self.ou.calc_noise(action[0][2], -0.1, 1.00, 0.05)

                ###################################################################
                ### Concept of a "stochastic" brake adapted and improved from   ###
                ### https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html     ###
                ### The issue is that slamming the brake all the time isn't     ###
                ### adequately represented in the reward function. Therefore    ###
                ### we "hack" the OU process by triggering the brake with a     ###
                ### chance of min(0.18, self.epsilon)                           ###
                ###################################################################
                if random.random() <= min(0.18, self.epsilon):
                    noise[0][2] = self.epsilon * self.ou.calc_noise(action[0][2], 0.25, 1.00, 0.10)

                ### add OU noise to actions
                action_with_noise[0][0] = action[0][0] + noise[0][0]
                action_with_noise[0][1] = action[0][1] + noise[0][1]
                action_with_noise[0][2] = action[0][2] + noise[0][2]

                next_state, reward, done, info = self.env.step(action_with_noise[0])

                ### build state representation
                dist_raced = next_state.distRaced
                speedX = next_state.speedX
                pre_damage = damage
                damage = next_state.damage
                racePos = next_state.racePos
                next_state = np.hstack(
                    (next_state.angle, next_state.track, next_state.focus,
                     next_state.opponents, next_state.trackPos, next_state.speedX,
                     next_state.speedY, next_state.speedZ,
                     next_state.wheelSpinVel / 100.0, next_state.rpm))

                ### save to experience replay memory for batch selection
                self.memory.memorize(state, action_with_noise[0], reward, next_state, done)

                ### lower epsilon for less exploration
                self.lowerExploration()

                ### train the models!
                self.trainModel()

                total_reward += reward
                avg_speed += speedX
                avg_racepos += racePos
                state = next_state

                ### detect damage
                if damage - pre_damage > 0:
                    damage_hit_counter += 1

                print("Episode: " + str(e) + " Step: " + str(j) +
                      " Action: " + str(action_with_noise) +
                      " Reward: " + str(reward) + " Epsilon: " + str(self.epsilon))

                if done:
                    all_total_rewards.append(total_reward)
                    all_dist_raced.append(dist_raced)

                    ### use track length according to chosen track
                    if self.track == "eroad":
                        track_length = 3260
                    elif self.track == "cgspeedway":
                        track_length = 2057
                    elif self.track == "forza":
                        track_length = 5784

                    percentage_of_track = round(((dist_raced / track_length) * 100), 0)
                    ### in case the agent completed multiple laps, which is likely for a well-trained agent
                    if percentage_of_track > 100:
                        percentage_of_track = 100

                    all_dist_percentage.append(percentage_of_track)
                    all_avg_speed.append((avg_speed / j))
                    all_car_hits.append(damage_hit_counter)
                    all_race_pos.append(int(avg_racepos / j))
                    break

        self.env.end()

        ### All the plotting stuff
        print("Plotting rewards!")
        plt.plot(all_total_rewards)
        plt.xlabel("Episode")
        plt.ylabel("Return")
        plt.show()

        print("Plotting distances!")
        plt.plot(all_dist_raced)
        plt.xlabel("Episode")
        plt.ylabel("Distance from start line [m]")
        plt.show()

        print("Plotting completeness!")
        plt.plot(all_dist_percentage)
        plt.xlabel("Episode")
        plt.ylabel("Track completion [%]")
        plt.axis([0, 350, 0, 100])
        plt.show()

        print("Plotting avg speed!")
        plt.plot(all_avg_speed)
        plt.xlabel("Episode")
        plt.ylabel("Average speed [km/h]")
        plt.axis([0, 350, 0, 1])
        plt.show()

        print("Plotting car hits!")
        plt.plot(all_car_hits)
        plt.xlabel("Episode")
        plt.ylabel("Vehicle collisions")
        plt.show()

        print("Mean car hits:")
        print(sum(all_car_hits) / len(all_car_hits))
        print("Std dev car hits:")
        print(np.std(all_car_hits))

        print("Plotting car hits per distance!")
        div = np.divide(all_car_hits, all_dist_raced)
        plt.plot(div)
        plt.xlabel("Episode")
        plt.ylabel("Vehicle collisions per distance unit")
        plt.show()

        print("Plotting avg race pos!")
        plt.plot(all_race_pos)
        plt.xlabel("Episode")
        plt.ylabel("Average race position")
        plt.show()

    def trainModel(self):
        ### get random mini batch from experience replay memory
        mini_batch = self.memory.sampleRandomBatch(self.batch_size)

        ### build arrays for models from mini batch
        states = np.asarray([b[0] for b in mini_batch])
        actions = np.asarray([b[1] for b in mini_batch])
        target = np.asarray([b[1] for b in mini_batch])  # action-shaped placeholder, overwritten below
        rewards = np.asarray([b[2] for b in mini_batch])
        new_states = np.asarray([b[3] for b in mini_batch])
        dones = np.asarray([b[4] for b in mini_batch])

        ### get q values from target critic model
        ### q(s, t(s), w') in thesis
        target_q_values = self.critic.target_model.predict(
            [new_states, self.actor.target_model.predict(new_states)])

        ### iterate through minibatch, update target according to Bellman eq.
        for k in range(0, len(mini_batch)):
            if dones[k]:
                target[k] = rewards[k]
            else:
                target[k] = rewards[k] + self.gamma * target_q_values[k]

        ### train networks
        self.critic.model.train_on_batch([states, actions], target)
        actions = self.actor.model.predict(states)

        ### nabla q(s, t(s))
        gradients = self.critic.gradients(states, actions)

        ### train actor
        self.actor.train(states, gradients)

        ### soft update
        self.actor.target_train()
        self.critic.target_train()

    def testAgent(self):
        ### set epsilon (exploration) low
        self.epsilon = self.epsilon_min

        ### Do not save weights when testing
        ### CHANGE if you want to continuously train the agent
        self.save_model = False

        try:
            self.actor.model = load_model("./ddpg_weights/ddpg_actor_model.h5")
            self.critic.model = load_model("./ddpg_weights/ddpg_critic_model.h5")
            print("Model loaded!")
        except:
            print("Model could not be loaded! Check path or train first")
            sys.exit()

        self.trainAgent()
class DDPG(object):
    """docstring for DDPG"""

    def __init__(self, flags, sess):
        self.dim_laser = [flags.dim_laser_b, flags.dim_laser_c]
        self.dim_goal = flags.dim_goal
        self.dim_action = flags.dim_action
        self.dim_emb = flags.dim_emb
        self.dim_cmd = flags.dim_cmd
        self.n_hidden = flags.n_hidden
        self.n_cmd_type = flags.n_cmd_type
        self.n_layers = flags.n_layers
        self.a_learning_rate = flags.a_learning_rate
        self.c_learning_rate = flags.c_learning_rate
        self.batch_size = flags.batch_size
        self.max_step = flags.max_step
        self.tau = flags.tau
        self.action_range = [flags.a_linear_range, flags.a_angular_range]
        self.buffer_size = flags.buffer_size
        self.gamma = flags.gamma
        self.demo_flag = flags.demo_flag

        self.actor = Actor(sess=sess,
                           dim_laser=self.dim_laser,
                           dim_cmd=self.dim_cmd,
                           dim_action=self.dim_action,
                           dim_goal=self.dim_goal,
                           dim_emb=self.dim_emb,
                           n_cmd_type=self.n_cmd_type,
                           n_hidden=self.n_hidden,
                           n_layers=self.n_layers,
                           max_step=self.max_step,
                           batch_size=self.batch_size,
                           action_range=self.action_range,
                           tau=self.tau,
                           gpu_num=1,
                           demo_flag=self.demo_flag)
        self.critic = Critic(sess=sess,
                             dim_laser=self.dim_laser,
                             dim_cmd=self.dim_cmd,
                             dim_action=self.dim_action,
                             dim_goal=self.dim_goal,
                             dim_emb=self.dim_emb,
                             n_cmd_type=self.n_cmd_type,
                             n_hidden=self.n_hidden,
                             n_layers=self.n_layers,
                             max_step=self.max_step,
                             batch_size=self.batch_size,
                             num_actor_vars=len(self.actor.network_params) +
                             len(self.actor.target_network_params),
                             tau=self.tau,
                             gpu_num=1)
        self.memory = []

    def ActorPredict(self, input_laser, input_cmd, input_cmd_next,
                     input_cmd_skip, prev_action, input_goal, prev_state_2):
        a, state_2 = self.actor.PredictOnline(input_laser, input_cmd,
                                              input_cmd_next, input_cmd_skip,
                                              prev_action, input_goal,
                                              prev_state_2)
        return a[0], state_2

    def Add2Mem(self, sample):
        if len(sample) <= self.max_step:
            # seqs of (laser, cmd, cmd_next, cmd_skip, prev_action, obj_goal,
            #          prev_state_2, action, r, terminate, status, action_label)
            self.memory.append(sample)
            if len(self.memory) > self.buffer_size:
                self.memory.pop(0)

    def SampleBatch(self):
        if len(self.memory) >= self.batch_size:
            indices = np.random.randint(0, len(self.memory) - 1, size=self.batch_size)

            laser_t_batch = np.empty((self.batch_size, self.dim_laser[0], self.dim_laser[1]), dtype=np.float32)
            cmd_t_batch = np.empty((self.batch_size, self.dim_cmd), dtype=np.int64)
            cmd_next_t_batch = np.empty((self.batch_size, self.dim_cmd), dtype=np.int64)
            cmd_skip_t_batch = np.empty((self.batch_size, self.dim_cmd), dtype=np.int64)
            prev_action_t_batch = np.empty((self.batch_size, self.dim_action), dtype=np.float32)
            goal_t_batch = np.empty((self.batch_size, self.dim_goal), dtype=np.float32)
            goal_a_t_batch = np.empty((self.batch_size, self.dim_goal), dtype=np.float32)
            prev_state_2_t_batch = [
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32),
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32)
            ]
            action_t_batch = np.empty((self.batch_size, self.dim_action), dtype=np.float32)
            reward_batch = np.empty((self.batch_size), dtype=np.float32)
            terminate_batch = np.empty((self.batch_size), dtype=bool)
            status_batch = np.empty((self.batch_size, 1), dtype=np.int64)
            action_batch = np.empty((self.batch_size, self.dim_action), dtype=np.float32)

            laser_t1_batch = np.empty((self.batch_size, self.dim_laser[0], self.dim_laser[1]), dtype=np.float32)
            cmd_t1_batch = np.empty((self.batch_size, self.dim_cmd), dtype=np.int64)
            cmd_next_t1_batch = np.empty((self.batch_size, self.dim_cmd), dtype=np.int64)
            cmd_skip_t1_batch = np.empty((self.batch_size, self.dim_cmd), dtype=np.int64)
            prev_action_t1_batch = np.empty((self.batch_size, self.dim_action), dtype=np.float32)
            goal_t1_batch = np.empty((self.batch_size, self.dim_goal), dtype=np.float32)
            goal_a_t1_batch = np.empty((self.batch_size, self.dim_goal), dtype=np.float32)
            prev_state_2_t1_batch = [
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32),
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32)
            ]
            action_t1_batch = np.empty((self.batch_size, self.dim_action), dtype=np.float32)

            for i, idx in enumerate(indices):
                laser_t_batch[i] = self.memory[idx][0]
                cmd_t_batch[i] = self.memory[idx][1]
                cmd_next_t_batch[i] = self.memory[idx][2]
                cmd_skip_t_batch[i] = self.memory[idx][3]
                prev_action_t_batch[i] = self.memory[idx][4]
                goal_t_batch[i] = self.memory[idx][5]
                prev_state_2_t_batch[0][i] = self.memory[idx][6][0][0]
                prev_state_2_t_batch[1][i] = self.memory[idx][6][1][0]
                action_t_batch[i] = self.memory[idx][7]
                reward_batch[i] = self.memory[idx][8]
                terminate_batch[i] = self.memory[idx][9]
                status_batch[i] = self.memory[idx][10]
                action_batch[i] = self.memory[idx][11]

                laser_t1_batch[i] = self.memory[idx + 1][0]
                cmd_t1_batch[i] = self.memory[idx + 1][1]
                cmd_next_t1_batch[i] = self.memory[idx + 1][2]
                # prev_action_t1_batch[i] = self.memory[idx+1][4]
                goal_t1_batch[i] = self.memory[idx + 1][5]
                prev_state_2_t1_batch[0][i] = self.memory[idx + 1][6][0][0]
                prev_state_2_t1_batch[1][i] = self.memory[idx + 1][6][1][0]
                action_t1_batch[i] = self.memory[idx + 1][8]

                if cmd_t_batch[i] == 5:
                    goal_a_t_batch[i] = self.memory[idx][5]
                else:
                    goal_a_t_batch[i] = [0., 0.]
                if cmd_t1_batch[i] == 5:
                    goal_a_t1_batch[i] = self.memory[idx + 1][5]
                else:
                    goal_a_t1_batch[i] = [0., 0.]

            return [
                laser_t_batch, cmd_t_batch, cmd_next_t_batch, cmd_skip_t_batch,
                prev_action_t_batch, goal_a_t_batch, goal_a_t_batch,
                prev_state_2_t_batch, action_t_batch, reward_batch,
                terminate_batch, status_batch, action_batch, laser_t1_batch,
                cmd_t1_batch, cmd_next_t1_batch, action_t_batch,
                goal_a_t1_batch, goal_a_t1_batch, action_t1_batch
            ], indices
        else:
            print('samples are not enough')
            return None, None

    def Train(self):
        start_time = time.time()
        batch, indices = self.SampleBatch()
        sample_time = time.time() - start_time
        if not batch:
            return 0.
        else:
            [laser_t_batch, cmd_t_batch, cmd_next_t_batch, cmd_skip_t_batch,
             prev_action_t_batch, goal_t_batch, goal_a_t_batch,
             prev_state_2_t_batch, action_t_batch, reward_batch,
             terminate_batch, status_batch, action_batch, laser_t1_batch,
             cmd_t1_batch, cmd_next_t1_batch, prev_action_t1_batch,
             goal_t1_batch, goal_a_t1_batch, action_t1_batch] = batch

            # compute target y
            target_a_pred = self.actor.PredictTarget(
                laser=laser_t1_batch,
                cmd=cmd_t1_batch,
                cmd_next=cmd_next_t1_batch,
                prev_action=prev_action_t1_batch,
                obj_goal=goal_a_t1_batch)
            target_q_pred = self.critic.PredictTarget(
                laser=laser_t1_batch,
                cmd=cmd_t1_batch,
                cmd_next=cmd_next_t1_batch,
                prev_action=prev_action_t1_batch,
                obj_goal=goal_t1_batch,
                action=action_t1_batch)
            y = []
            for i in range(self.batch_size):
                if terminate_batch[i]:
                    y.append(reward_batch[i])
                else:
                    y.append(reward_batch[i] + self.gamma * target_q_pred[i, 0])
            y = np.expand_dims(np.stack(y), axis=1)
            y_time = time.time() - start_time - sample_time

            # critic update
            q, _ = self.critic.Train(laser=laser_t_batch,
                                     cmd=cmd_t_batch,
                                     cmd_next=cmd_next_t_batch,
                                     prev_action=prev_action_t_batch,
                                     obj_goal=goal_t_batch,
                                     action=action_t_batch,
                                     y=y)

            # actions for a_gradients from critic
            actions, states_2 = self.actor.PredictOnline(
                laser=laser_t_batch,
                cmd=cmd_t_batch,
                cmd_next=cmd_next_t_batch,
                cmd_skip=cmd_skip_t_batch,
                prev_action=prev_action_t_batch,
                obj_goal=goal_a_t_batch,
                prev_state_2=prev_state_2_t_batch)

            # a_gradients
            a_gradients = self.critic.ActionGradients(
                laser=laser_t_batch,
                cmd=cmd_t_batch,
                cmd_next=cmd_next_t_batch,
                prev_action=prev_action_t_batch,
                obj_goal=goal_t_batch,
                action=actions)

            # actor update
            self.actor.Train(laser=laser_t_batch,
                             cmd=cmd_t_batch,
                             cmd_next=cmd_next_t_batch,
                             cmd_skip=cmd_skip_t_batch,
                             prev_action=prev_action_t_batch,
                             obj_goal=goal_a_t_batch,
                             prev_state_2=prev_state_2_t_batch,
                             a_gradient=a_gradients[0],
                             status_label=status_batch,
                             action_label=action_batch)
            train_time = time.time() - start_time - sample_time - y_time

            # target networks update
            self.critic.UpdateTarget()
            self.actor.UpdateTarget()
            target_time = time.time() - start_time - sample_time - y_time - train_time

            # print('sample_time:{:.3f}, y_time:{:.3f}, train_time:{:.3f}, target_time:{:.3f}'.format(
            #     sample_time, y_time, train_time, target_time))

            return q
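# --- Illustrative helper (not part of the DDPG class above) ---
# Actor.UpdateTarget() / Critic.UpdateTarget() are not shown here. In standard
# DDPG they perform a soft (Polyak) update of the target parameters,
# theta_target <- tau * theta + (1 - tau) * theta_target. Below is a minimal
# sketch of how such an op is typically built from TF1-style variable lists;
# the function name is hypothetical, and only network_params /
# target_network_params are taken from the constructor above.
import tensorflow as tf

def build_update_target_op(network_params, target_network_params, tau):
    # one assign op per target variable, blending online and target weights
    return [
        target_var.assign(tau * online_var + (1.0 - tau) * target_var)
        for online_var, target_var in zip(network_params, target_network_params)
    ]

# usage sketch:
#   update_ops = build_update_target_op(actor.network_params,
#                                       actor.target_network_params, tau)
#   sess.run(update_ops)  # run once per training step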
class DDPGagent(object):
    def __init__(self, env):
        self.sess = tf.Session()
        K.set_session(self.sess)

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.TAU, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                             self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize for later gradient calculation
        self.sess.run(tf.global_variables_initializer())  # <-- no problem without it

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []

    ## Ornstein-Uhlenbeck noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        return x + rho * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)

    ## computing TD target: y_k = r_k + gamma * Q(s_k+1, a_k+1)
    def td_target(self, rewards, q_values, dones):
        y_k = np.asarray(q_values)
        for i in range(q_values.shape[0]):  # number of batch
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.GAMMA * q_values[i]
        return y_k

    ## train the agent
    def train(self, max_episode_num):
        # initially transfer model weights to the target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        for ep in range(int(max_episode_num)):
            # reset OU noise
            pre_noise = np.zeros(self.action_dim)
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()

            while not done:
                # visualize the environment
                #self.env.render()

                # pick an action: shape = (1,)
                action = self.actor.predict(state)
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # clip continuous action to be within action_bound
                action = np.clip(action + noise, -self.action_bound, self.action_bound)
                # observe reward, new_state
                next_state, reward, done, _ = self.env.step(action)
                # add transition to replay buffer
                train_reward = (reward + 8) / 8
                self.buffer.add_buffer(state, action, train_reward, next_state, done)

                if self.buffer.buffer_size > 1000:  # start training after the buffer has some samples
                    # sample transitions from replay buffer
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(self.BATCH_SIZE)

                    # predict target Q-values
                    target_qs = self.critic.target_predict(
                        [next_states, self.actor.target_predict(next_states)])

                    # compute TD targets
                    y_i = self.td_target(rewards, target_qs, dones)

                    # train critic using sampled batch
                    self.critic.train_on_batch(states, actions, y_i)

                    # Q gradient wrt current policy
                    s_actions = self.actor.model.predict(states)  # shape=(batch, 1)
                    # caution: NOT self.actor.predict !
                    # self.actor.model.predict(state) -> shape=(1,1)
                    # self.actor.predict(state)       -> shape=(1,) -> type of gym action
                    s_grads = self.critic.dq_da(states, s_actions)
                    dq_das = np.array(s_grads).reshape((-1, self.action_dim))

                    # train actor
                    self.actor.train(states, dq_das)

                    # update both target networks
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                # update current state
                pre_noise = noise
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ', episode_reward)
            self.save_epi_reward.append(episode_reward)

            ## save weights every episode
            #print('Now save')
            self.actor.save_weights("./save_weights/pendulum_actor.h5")
            self.critic.save_weights("./save_weights/pendulum_critic.h5")

        np.savetxt('./save_weights/pendulum_epi_reward.txt', self.save_epi_reward)
        print(self.save_epi_reward)

    ## save them to file if done
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
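# --- Usage sketch (not part of the original file) ---
# A minimal way to run the pendulum agent above, assuming a gym version with the
# classic 4-tuple step API and the Pendulum environment, whose roughly [-16, 0]
# reward range the (reward + 8) / 8 rescaling in train() maps to about [-1, 1].
# The environment id and episode count are assumptions.
import gym

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    agent = DDPGagent(env)
    agent.train(max_episode_num=200)  # trains and saves weights every episode
    agent.plot_result()               # plot the recorded episode rewards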
class DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score
        self.score = 0
        self.count = 0
        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.total_reward += reward
        self.count += 1

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.count > 0:
            self.score = self.total_reward / float(self.count)
            if self.score > self.best_score:
                self.best_score = self.score
        else:
            self.score = 0

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
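# --- Illustrative helper (not part of the DDPG_Agent class above) ---
# DDPG_Agent constructs OUNoise(action_size, mu, theta, sigma) and calls
# noise.reset() and noise.sample(). The original class is not shown; this is a
# common implementation consistent with that interface, offered as a minimal
# sketch rather than the author's exact code.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state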