    def run(self):
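        # Optimizer process: consumes transitions pushed by the environment
        # workers, trains the policy network on sampled minibatches, and
        # publishes the updated model until every worker reports done.
        # Assumes module-level imports of tensorflow as tf, plus ReplayMemory
        # and DQNTransition from the agent's replay-memory utilities.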
        memory = ReplayMemory(self.memory_capacity,
                              transition_type=DQNTransition)

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        loss_fn = tf.keras.losses.Huber()

        while self.done_queue.qsize() < self.n_envs:
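            # Drain every transition the environment workers have queued so far.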
            while self.memory_queue.qsize() > 0:
                sample = self.memory_queue.get()
                memory.push(*sample)

            # Wait until replay memory holds at least one full batch.
            if len(memory) < self.batch_size:
                continue

            # Sample a random minibatch and transpose it into per-field tuples.
            transitions = memory.sample(self.batch_size)
            batch = DQNTransition(*zip(*transitions))

            state_batch = tf.convert_to_tensor(batch.state)
            action_batch = tf.convert_to_tensor(batch.action)
            reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
            next_state_batch = tf.convert_to_tensor(batch.next_state)
            done_batch = tf.convert_to_tensor(batch.done)

            with tf.GradientTape() as tape:
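                # Q(s, a) for the actions actually taken, selected with a one-hot mask.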
                state_action_values = tf.math.reduce_sum(
                    self.model.policy_network(state_batch) *
                    tf.one_hot(action_batch, self.model.n_actions),
                    axis=1)

                # Bootstrapped value of the next state from the target network,
                # zeroed for terminal transitions.
                next_state_values = tf.where(
                    done_batch, tf.zeros(self.batch_size),
                    tf.math.reduce_max(
                        self.model.target_network(next_state_batch), axis=1))

                # TD target r + gamma * max_a' Q_target(s', a'), compared to
                # Q(s, a) with the Huber loss.
                expected_state_action_values = reward_batch + \
                    (self.discount_factor * next_state_values)
                loss_value = loss_fn(expected_state_action_values,
                                     state_action_values)

            # Backpropagate the loss and update the policy network's weights.
            variables = self.model.policy_network.trainable_variables
            gradients = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(gradients, variables))

            # Publish the updated model so the environment workers can refresh
            # their local copies.
            self.model_update_queue.put(self.model)
Example #2
    def run(self):
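        # Synchronous variant: each iteration waits for all environment workers,
        # runs one optimization step, refreshes the target network, and
        # broadcasts the serialized policy network back to the workers.
        # Assumes module-level imports of time and tensorflow as tf, plus
        # ReplayMemory and DQNTransition.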
        episode = 0

        for i in range(self.n_envs):
            self.model_update_queue.put(self.model.policy_network.to_json())

        memory = ReplayMemory(self.memory_capacity,
                              transition_type=DQNTransition)

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        loss_fn = tf.keras.losses.Huber()

        while self.done_queue.qsize() < self.n_envs:
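            # Wait for every worker to check in on the sync queue, draining new
            # transitions into replay memory while waiting.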
            while self.sync_queue.qsize() < self.n_envs:
                while self.memory_queue.qsize() > 0:
                    sample = self.memory_queue.get()
                    memory.push(*sample)

            # Clear the sync signals now that every worker has checked in.
            self.empty_queue(self.sync_queue)

            transitions = memory.sample(self.batch_size)
            batch = DQNTransition(*zip(*transitions))

            state_batch = tf.convert_to_tensor(batch.state)
            action_batch = tf.convert_to_tensor(batch.action)
            reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
            next_state_batch = tf.convert_to_tensor(batch.next_state)
            done_batch = tf.convert_to_tensor(batch.done)

            with tf.GradientTape() as tape:
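                # Same TD-error loss as above: Huber(TD target, Q(s, a)).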
                state_action_values = tf.math.reduce_sum(
                    self.model.policy_network(state_batch) *
                    tf.one_hot(action_batch, self.model.n_actions),
                    axis=1)

                next_state_values = tf.where(
                    done_batch, tf.zeros(self.batch_size),
                    tf.math.reduce_max(
                        self.model.target_network(next_state_batch), axis=1))

                expected_state_action_values = reward_batch + \
                                               (self.discount_factor * next_state_values)
                loss_value = loss_fn(expected_state_action_values,
                                     state_action_values)

            variables = self.model.policy_network.trainable_variables
            gradients = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(gradients, variables))

            # Refresh the target network after this optimization step.
            self.model.update_target_network()

            # Broadcast the updated serialized network to every worker, then
            # post sync tokens for them.
            json_model = self.model.policy_network.to_json()
            for n in range(self.n_envs):
                self.model_update_queue.put(json_model)

            for n in range(self.n_envs * 2):
                self.sync_queue.put(1)

            episode += 1
            if episode == self.n_episodes:
                break

            # Wait until every worker has consumed its model update.
            while self.model_update_queue.qsize() > 0:
                time.sleep(.1)

            print("")
            self.empty_queue(self.sync_queue)

        # Drain any model updates the workers never consumed before returning.
        self.empty_queue(self.model_update_queue)

        time.sleep(2)

        return self.model