def run(self):
    memory = ReplayMemory(self.memory_capacity, transition_type=DQNTransition)
    optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    loss_fn = tf.keras.losses.Huber()

    while self.done_queue.qsize() < self.n_envs:
        # Drain the shared memory queue into the local replay memory.
        while self.memory_queue.qsize() > 0:
            sample = self.memory_queue.get()
            memory.push(*sample)

        # Wait until there are enough transitions to form a full batch.
        if len(memory) < self.batch_size:
            continue

        transitions = memory.sample(self.batch_size)
        batch = DQNTransition(*zip(*transitions))

        state_batch = tf.convert_to_tensor(batch.state)
        action_batch = tf.convert_to_tensor(batch.action)
        reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(batch.next_state)
        done_batch = tf.convert_to_tensor(batch.done)

        with tf.GradientTape() as tape:
            # Q-values of the actions actually taken.
            state_action_values = tf.math.reduce_sum(
                self.model.policy_network(state_batch)
                * tf.one_hot(action_batch, self.model.n_actions),
                axis=1
            )

            # Bootstrapped value of the next state; zero for terminal states.
            next_state_values = tf.where(
                done_batch,
                tf.zeros(self.batch_size),
                tf.math.reduce_max(self.model.target_network(next_state_batch), axis=1)
            )

            expected_state_action_values = reward_batch + \
                (self.discount_factor * next_state_values)
            loss_value = loss_fn(expected_state_action_values, state_action_values)

        variables = self.model.policy_network.trainable_variables
        gradients = tape.gradient(loss_value, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Ship the updated model back to the environment workers.
        self.model_update_queue.put(self.model)
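Both versions of this optimizer loop lean on a DQNTransition record and a ReplayMemory buffer that are not part of the excerpt. Based only on how they are called here (push(*sample), sample(batch_size), len(memory), and the field order used when unpacking the batch), a minimal sketch of what they are assumed to look like is:

import random
from collections import namedtuple

# Assumed field order, matching how the training loop unpacks the batch.
DQNTransition = namedtuple('DQNTransition',
                           ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayMemory:
    """Fixed-capacity buffer of transitions (minimal sketch, not the original code)."""

    def __init__(self, capacity, transition_type=DQNTransition):
        self.capacity = capacity
        self.transition_type = transition_type
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Overwrite the oldest transition once the buffer is full.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = self.transition_type(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)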
def run(self):
    episode = 0

    # Hand the initial policy network to every environment worker as JSON.
    for i in range(self.n_envs):
        self.model_update_queue.put(self.model.policy_network.to_json())

    memory = ReplayMemory(self.memory_capacity, transition_type=DQNTransition)
    optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    loss_fn = tf.keras.losses.Huber()

    while self.done_queue.qsize() < self.n_envs:
        # Collect experience until every environment has checked in for this round.
        while self.sync_queue.qsize() < self.n_envs:
            while self.memory_queue.qsize() > 0:
                sample = self.memory_queue.get()
                memory.push(*sample)

        self.empty_queue(self.sync_queue)

        transitions = memory.sample(self.batch_size)
        batch = DQNTransition(*zip(*transitions))

        state_batch = tf.convert_to_tensor(batch.state)
        action_batch = tf.convert_to_tensor(batch.action)
        reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(batch.next_state)
        done_batch = tf.convert_to_tensor(batch.done)

        with tf.GradientTape() as tape:
            # Q-values of the actions actually taken.
            state_action_values = tf.math.reduce_sum(
                self.model.policy_network(state_batch)
                * tf.one_hot(action_batch, self.model.n_actions),
                axis=1
            )

            # Bootstrapped value of the next state; zero for terminal states.
            next_state_values = tf.where(
                done_batch,
                tf.zeros(self.batch_size),
                tf.math.reduce_max(self.model.target_network(next_state_batch), axis=1)
            )

            expected_state_action_values = reward_batch + \
                (self.discount_factor * next_state_values)
            loss_value = loss_fn(expected_state_action_values, state_action_values)

        variables = self.model.policy_network.trainable_variables
        gradients = tape.gradient(loss_value, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        self.model.update_target_network()

        # Broadcast the updated policy weights and release the waiting workers.
        json_model = self.model.policy_network.to_json()

        for n in range(self.n_envs):
            self.model_update_queue.put(json_model)

        for n in range(self.n_envs * 2):
            self.sync_queue.put(1)

        episode += 1

        if episode == self.n_episodes:
            break

        # Wait until every worker has picked up the new weights.
        while self.model_update_queue.qsize() > 0:
            time.sleep(.1)

        print("")

    self.empty_queue(self.sync_queue)
    self.empty_queue(self.model_update_queue)

    # Give the workers a moment to shut down cleanly before returning.
    time.sleep(2)

    return self.model
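The second loop also calls two helpers that are not shown in the excerpt: self.empty_queue(...) and self.model.update_target_network(). Their actual implementations may differ; based only on how they are used here, they are assumed to behave roughly like the following sketch (empty_queue living on the optimizer, update_target_network on the model):

import queue

def empty_queue(self, q):
    # Assumed behavior: drain and discard everything currently in the queue.
    while q.qsize() > 0:
        try:
            q.get_nowait()
        except queue.Empty:
            break

def update_target_network(self):
    # Assumed hard update: copy the policy network's weights into the target
    # network (a soft/Polyak update would blend the two instead).
    self.target_network.set_weights(self.policy_network.get_weights())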