def main():
    """Main training entry point: logs the runtime and prints it at the end."""
    s_time = timeit.default_timer()
    global iteration
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    memory = ReplayBuffer()
    epsilon = 1
    train_indicator = True
    modelPATH = os.path.join('.', "models", 'E0011.pt')

    q, q_target = QNet(state_dim, action_dim), QNet(state_dim, action_dim)
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(state_dim), MuNet(state_dim)
    mu_target.load_state_dict(mu.state_dict())

    steer_noise = OUN(np.zeros(1), theta=0.6)
    accel_noise = OUN(np.zeros(1), theta=0.6)

    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer = optim.Adam(q.parameters(), lr=lr_q)

    # TensorBoard writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "ddpg_torch", current_time + 'E0011t')
    writer = SummaryWriter(log_dir)
    samplestate = torch.rand(1, 29)
    sampleaction = torch.rand(1, 2)
    # writer.add_graph(mu, samplestate)
    writer.add_graph(q, (samplestate, sampleaction))
    writer.close()

    if not train_indicator:
        # Evaluation only: load the saved policy and run it until the episode ends
        mu = torch.load(modelPATH)
        mu.eval()
        ob = env.reset()
        score = 0
        for n_step in range(100000):
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                             ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            a_t = mu(torch.from_numpy(s_t.reshape(1, -1)).float()).detach().numpy()
            ob, r_t, done, _ = env.step(a_t[0])
            score += r_t
            if done:
                print("score:", score)
                break
        env.end()
        return 0

    for n_epi in range(max_episode):
        print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size()))
        if np.mod(n_epi, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        a_t = np.zeros([1, action_dim])
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        score = 0
        q_value_writer(q, mu, s_t, writer, 'Episode Start Q value')
        q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value')
        # t_start = timeit.default_timer()

        for n_step in range(max_step):
            # epsilon -= 1.0 / EXPLORE
            a_origin = mu(torch.from_numpy(s_t.reshape(1, -1)).float())

            if train_indicator:  # add noise for training
                # sn = max(epsilon, 0) * steer_noise()
                sn = steer_noise()
                # an = max(epsilon, 0) * accel_noise()
                an = accel_noise()
                a_s = a_origin.detach().numpy()[0][0] + sn
                a_t[0][0] = np.clip(a_s, -1, 1)  # fit in steer range
                a_a = a_origin.detach().numpy()[0][1] + an
                a_t[0][1] = np.clip(a_a, 0, 1)   # fit in accel range
                # record noise movement
                if iteration % 10 == 0:
                    writer.add_scalar('Steer noise', sn, iteration)
                    writer.add_scalar('Accel_noise', an, iteration)
            else:
                a_t = a_origin.detach().numpy()

            ob, r_t, done, _ = env.step(a_t[0])
            score += r_t
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            memory.put((s_t, a_t[0], r_t, s_t1, done))
            s_temp = copy.deepcopy(s_t)  # kept for the end-of-episode Q value log
            s_t = s_t1

            if train_indicator and memory.size() > train_start_size:
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer, writer)
                soft_update(mu, mu_target)
                soft_update(q, q_target)

            iteration += 1

            if done:
                q_value_writer(q, mu, s_temp, writer, 'Episode End Q value')
                q_value_writer(q_target, mu_target, s_temp, writer, 'Episode End target Q value')
                break

        # t_end = timeit.default_timer()
        print("TOTAL REWARD @ " + str(n_epi) + "-th Episode : Reward " + str(score))
        print("Total Step: " + str(n_step))
        print("")
        # print('{}steps, {} time spent'.format(i, t_end - t_start))
        torch.save(mu, modelPATH)

    env.end()
    e_time = timeit.default_timer()
    print("Total step {} and time spent {}".format(iteration, e_time - s_time))
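
# The loop above calls soft_update() and q_value_writer(), which are defined elsewhere in
# this codebase. Below is a minimal sketch of what they might look like, inferred from the
# call sites: the module-level `tau` and the use of the global `iteration` counter as the
# TensorBoard step are assumptions, not the repository's actual definitions.
def soft_update(net, net_target):
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    for param, param_target in zip(net.parameters(), net_target.parameters()):
        param_target.data.copy_(tau * param.data + (1.0 - tau) * param_target.data)

def q_value_writer(q_net, mu_net, state, writer, tag):
    # Log Q(s, mu(s)) for a single state to TensorBoard under the given tag
    with torch.no_grad():
        s = torch.from_numpy(state.reshape(1, -1)).float()
        q_value = q_net(s, mu_net(s))
    writer.add_scalar(tag, q_value.item(), iteration)
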
class DDPG:
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)

    def update_target(self):
        # Two methods to update the target networks
        # Method 1:
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        """
        # Method 2:
        new_weights = []
        target_variables = self.target_critic.weights
        for i, variable in enumerate(self.critic.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_critic.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.weights
        for i, variable in enumerate(self.actor.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_actor.set_weights(new_weights)
        """

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch(
            self.minibatch_size)
        """
        mu_prime = self.target_actor(s2_batch)  # predictions by target actor
        Q_prime = self.target_critic([s2_batch, mu_prime])  # predictions by target critic
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Create a callback that saves the model's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,
                                                          save_weights_only=True,
                                                          verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables))
        """
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic([s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_opt.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])  # critic_value
            # Use `-q` because we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # since batch normalization is done on self.actor, its output is multiplied by upper_bound
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        # To store the reward history of each episode
        ep_reward_list = []
        # To store the average reward history of the last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler() as interruption:  # to properly save even if Ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                """
                If the env is created using the "gym.make" method, it will terminate
                after 200 steps.
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim,)),
                                           np.reshape(a, (self.a_dim,)), r, done,
                                           np.reshape(s_, (self.s_dim,)))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of the last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

        self.save_weights(save_name=self.save_name)  # if you want to save weights
        self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" % save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" % save_name)

        # to save in another format
        self.target_actor.save_weights('training/%s_actor_weights' % save_name, save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' % save_name, save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the learned policy
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment this to see the actor in action (but not in a notebook)
            # if done:
            #     break
        states = np.squeeze(np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self, avg_reward=None, actions=None, states=None, train=False, title=None):
        # An additional way to visualize the average episode rewards
        if train:
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Episodic Reward")
            plt.show()
        else:  # works only for the Pendulum-v0 environment
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(r'$\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(r'$\omega$')
            ax[2].plot(states[:, 2])  # angular velocity
            fig.canvas.set_window_title(title)
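
# A hedged usage sketch for the class above, based only on its method signatures. The
# explicit s_dim=3 (Pendulum-v0 observations are 3-dimensional), the save_name, and the
# final evaluation/plotting calls are illustrative assumptions, not part of the original.
if __name__ == '__main__':
    agent = DDPG(env=gym.make('Pendulum-v0'), s_dim=3, a_dim=1,
                 episodes=100, save_name='pendulum_weights', render=False)
    agent.train()                                      # train and save weights under training/
    states, actions = agent.collect_data(agent.actor)  # roll out the learned policy
    agent.plot_results(actions=actions, states=states, title='Learned policy')
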
def train(sess, env, actor, critic):
    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):
        s = env.reset()

        episode_reward = 0
        episode_ave_max_q = 0

        noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA, MAX_STEPS_EPISODE)
        noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

        for j in range(MAX_STEPS_EPISODE):
            if RENDER_ENV:
                env.render()

            # Add exploratory noise from an Ornstein-Uhlenbeck process to the action.
            # Exploration decays exponentially from 1 to 0 over EXPLORATION_TIME steps.
            if i < EXPLORATION_TIME:
                a = actor.predict(np.reshape(s, (1, env.observation_space.shape[0]))) + noise[j]
            else:
                a = actor.predict(np.reshape(s, (1, env.observation_space.shape[0])))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, actor.state_dim),
                              np.reshape(a, actor.action_dim), r, terminal,
                              np.reshape(s2, actor.state_dim))

            # Keep adding experience to the memory until
            # there are at least minibatch-size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    # If the state is terminal, assign the reward only
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # Else assign reward + discounted target Q
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = \
                    critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                episode_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                a_grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, a_grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r

            if terminal or j == MAX_STEPS_EPISODE - 1:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward,
                                           summary_vars[1]: episode_ave_max_q
                                       })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('Reward: %.2i' % int(episode_reward), '| Episode', i,
                      '| Qmax: %.4f' % (episode_ave_max_q / float(j)))
                break
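
# build_summaries() above is defined elsewhere in the original script. A minimal sketch of
# what it might look like is shown below, inferred only from how summary_ops/summary_vars
# are used above (two scalar summaries fed once per episode); the tag names are assumptions.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar('Qmax Value', episode_ave_max_q)
    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars
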
class DDPG:
    def __init__(self, env, sess, low_action_bound_list, high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list, self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001  # TODO: move these to configs
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-6
        self.gamma = 0.99
        self.tau = 0.001
        self.buffer_size = 1000000
        self.batch_size = 128
        self.theta = 0.15
        self.ou = 0
        self.sigma = 0.3

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  # self.env.action_space, make this an input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights,
                                        -self.actor_critic_grad)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

        # Create the CRITIC model
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = \
            critic_.create_critic_model()
        _, _, self.target_critic_model = critic_.create_critic_model()

        self.critic_grads = tf.gradients(self.critic_model.output, self.critic_action_input)

        self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
        self.noise.reset()

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'DDPG_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self, samples):
        current_states, actions, rewards, next_states, dones = samples

        predicted_actions = self.actor_model.predict(current_states)
        grads = self.sess.run(self.critic_grads,
                              feed_dict={
                                  self.critic_state_input: current_states,
                                  self.critic_action_input: predicted_actions
                              })[0]

        self.sess.run(self.optimize,
                      feed_dict={
                          self.actor_state_input: current_states,
                          self.actor_critic_grad: grads
                      })

        if self.epsilon - self.epsilon_decay > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        self.noise.reset()

    def train_critic(self, samples):
        current_states, actions, rewards, next_states, dones = samples

        target_actions = self.target_actor_model.predict(next_states)
        target_q_values = self.target_critic_model.predict([next_states, target_actions])

        rewards = rewards + self.gamma * target_q_values * (1 - dones)
        evaluation = self.critic_model.fit([current_states, actions], rewards, verbose=0)

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            self.train_actor(samples)
            self.train_critic(samples)

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()
        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = actor_model_weights[i] * self.tau + \
                target_actor_model_weights[i] * (1.0 - self.tau)
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()
        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = critic_model_weights[i] * self.tau + \
                target_critic_model_weights[i] * (1.0 - self.tau)
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION
    def act(self, current_episode, current_state):
        noise = self.epsilon * self.noise.generate()
        action = self.actor_model.predict(current_state) * self.high_action_bound_list + noise
        # TODO: add a linear mapping for the affine action space
        return np.clip(action, self.low_action_bound_list, self.high_action_bound_list)

class DDPG:
    def __init__(self, env, batch_size, mem_size, discount, actor_params, critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim, self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        at least batch_size experiences.
        '''
        if self._memory.size() > self._batch_size:
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        return self._critic._model.predict(state, action)

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r                      if done
          = r + gamma * q_next     otherwise
        '''
        # Use the actor network to pick the next action under the current policy,
        # then estimate its Q value with the critic network.
        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)
        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # Store the transition in the replay buffer, then train
        self._memory.add(state, action, reward, done, next_state)
        self.train()
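
# A hedged driver sketch for the class above: experience() both stores the transition and
# triggers a training step once the buffer is large enough. The episode/step counts, the
# state reshaping, and the target-network-free setup are illustrative assumptions.
def run(env, agent, episodes=100, max_steps=200):
    for ep in range(episodes):
        state = env.reset().reshape(1, -1)
        episode_reward = 0.0
        for t in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action[0])
            next_state = next_state.reshape(1, -1)
            agent.experience(state, action, reward, done, next_state)  # store + train
            episode_reward += reward
            state = next_state
            if done:
                break
        print('Episode {} reward: {:.2f}'.format(ep, episode_reward))
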
class TD3:
    def __init__(self, env, sess, low_action_bound_list, high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list, self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001
        self.exploration_noise = 0.1
        self.gamma = 0.90
        self.tau = 0.01
        self.buffer_size = 10000
        self.batch_size = 128
        self.policy_noise = 0.1
        self.noise_clip = 0.05
        self.exploration_episodes = 10
        # self.policy_freq = 2

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  # self.env.action_space, make this an input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights,
                                        -self.actor_critic_grad)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

        # Create the FIRST CRITIC model; this is the one we train/optimize against
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = \
            critic_.create_critic_model()
        # NOTE: empty loss string kept from the original; the critic model presumably
        # defines its loss internally
        self.critic_model.compile(optimizer=Adam(lr=critic_.learning_rate), loss='')
        _, _, self.target_critic_model = critic_.create_critic_model()
        self.target_critic_model.compile(optimizer=Adam(lr=critic_.learning_rate), loss='')

        self.critic_grads = tf.gradients(self.critic_model.output[0], self.critic_action_input)

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'TD3_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            current_states, actions, rewards, next_states, dones = samples

            predicted_actions = self.actor_model.predict(current_states) * \
                self.high_action_bound_list  # TODO: create a linear mapping for the affine space
            grads = self.sess.run(self.critic_grads,
                                  feed_dict={
                                      self.critic_state_input: current_states,
                                      self.critic_action_input: predicted_actions
                                  })[0]

            self.sess.run(self.optimize,
                          feed_dict={
                              self.actor_state_input: current_states,
                              self.actor_critic_grad: grads
                          })

    def train_critic(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            current_states, actions, rewards, next_states, dones = samples

            target_actions = self.target_actor_model.predict(next_states) * \
                self.high_action_bound_list

            # COMPUTING THE FIRST CRITIC TARGET
            # introduce a small amount of noise to the action for smoothing purposes
            noise = np.random.normal(size=len(self.action_range_bound)) * self.policy_noise
            clipped_noise = np.clip(noise, -self.noise_clip, self.noise_clip)
            # add the noise to target_actions and clip to the range of valid actions
            target_actions = np.clip(target_actions + clipped_noise,
                                     self.low_action_bound_list,
                                     self.high_action_bound_list)

            target_q1_values, target_q2_values = self.target_critic_model.predict(
                [next_states, target_actions, np.random.rand(self.batch_size, 1)])
            target_q_values = np.minimum(target_q1_values, target_q2_values)

            target_q = rewards + self.gamma * target_q_values * (1 - dones)

            # current_q1, current_q2 = self.critic_model.predict([current_states, actions, np.random.rand(self.batch_size, 1)])
            history = self.critic_model.fit([current_states, actions, target_q], verbose=0)
            # print('Loss: ', history.history['loss'])

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            self.train_actor()
            self.train_critic()

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()
        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = actor_model_weights[i] * self.tau + \
                target_actor_model_weights[i] * (1.0 - self.tau)
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()
        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = critic_model_weights[i] * self.tau + \
                target_critic_model_weights[i] * (1.0 - self.tau)
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION with epsilon-greedy exploration
    def act(self, current_episode, current_state):
        if current_episode < self.exploration_episodes:
            return np.random.uniform(self.low_action_bound_list,
                                     self.high_action_bound_list) * self.high_action_bound_list
        else:
            action = self.actor_model.predict(current_state) * self.high_action_bound_list + \
                np.random.normal(0, [self.exploration_noise * hi
                                     for hi in self.high_action_bound_list])
            return np.clip(action, self.low_action_bound_list, self.high_action_bound_list)
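
# A hedged training-loop sketch for the TD3 class above. The replay_buffer.add() argument
# order (state, action, reward, next_state, done) is inferred from the order returned by
# sample_batch(); the session setup, episode/step counts, and reshaping are assumptions.
def run_td3(env, episodes=200, max_steps=200):
    sess = tf.Session()
    low, high = list(env.action_space.low), list(env.action_space.high)
    agent = TD3(env, sess, low, high)
    for ep in range(episodes):
        state = env.reset().reshape(1, -1)
        for t in range(max_steps):
            action = agent.act(ep, state)
            next_state, reward, done, _ = env.step(action[0])
            next_state = next_state.reshape(1, -1)
            agent.replay_buffer.add(state[0], action[0], reward, next_state[0], done)
            agent.train()                 # update the critics and the actor
            agent.update_target_models()  # Polyak-update the target networks
            state = next_state
            if done:
                break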