    def play(self, current_weights, epsilon=0.01):
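        #: play one episode with the given weights; returns (steps, rewards)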

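        #: hide GPU from this process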
        tf.config.set_visible_devices([], 'GPU')

        self.qnet.set_weights(current_weights)

        episode_steps, episode_rewards = 0, 0

        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)

        done = False
        while not done:

            state = np.stack(self.frames, axis=2)[np.newaxis, ...]

            action = self.qnet.sample_action(state, epsilon=epsilon)

            next_frame, reward, done, _ = self.env.step(action)

            self.frames.append(preprocess_frame(next_frame))

            episode_steps += 1

            episode_rewards += reward

            if episode_steps > 1000 and episode_rewards < 10:
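                #: bail out when play stalls without making progress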
                break

        return episode_steps, episode_rewards

    def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):
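        #: run n_testplay evaluation episodes (optionally with video recording
        #: and checkpoint loading); returns (scores, steps)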

        if checkpoint_path:
            env = gym.make(self.env_name)
            frame = util.preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            state = np.stack(frames, axis=2)[np.newaxis, ...]
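            #: define by run: build the network before loading weights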
            self.qnet(state)
            self.qnet.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(gym.make(self.env_name),
                                       monitor_dir,
                                       force=True,
                                       video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):

            frame = util.preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                epsilon = 0 if self.use_noisy else 0.05
                action = self.qnet.sample_action(state, epsilon)
                next_frame, reward, done, _ = env.step(action)
                frames.append(util.preprocess_frame(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: handle stalls where the game never starts (action: 0)
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps

    def define_network(self):
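        #: build local_qnet with a dummy forward pass on an initial state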

        #: hide GPU from remote actor
        tf.config.set_visible_devices([], 'GPU')

        #: define by run
        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)

        state = np.stack(self.frames, axis=2)[np.newaxis, ...]
        self.local_qnet(state)

    def play_with_video(self, checkpoint_path, monitor_dir, epsilon=0.01):
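        #: record one episode to monitor_dir using weights loaded from
        #: checkpoint_path; returns the episode reward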

        monitor_dir = Path(monitor_dir)
        if monitor_dir.exists():
            shutil.rmtree(monitor_dir)
        monitor_dir.mkdir()
        env = gym.wrappers.Monitor(gym.make(self.env_name),
                                   monitor_dir,
                                   force=True,
                                   video_callable=(lambda ep: True))

        frame = preprocess_frame(env.reset())
        frames = collections.deque([frame] * self.n_frames,
                                   maxlen=self.n_frames)

        state = np.stack(frames, axis=2)[np.newaxis, ...]
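        #: define by run: build the network before loading weights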
        self.qnet(state)
        self.qnet.load_weights(checkpoint_path)

        episode_steps, episode_rewards = 0, 0

        done = False
        while not done:

            state = np.stack(frames, axis=2)[np.newaxis, ...]

            action = self.qnet.sample_action(state, epsilon)

            next_frame, reward, done, _ = env.step(action)

            frames.append(preprocess_frame(next_frame))

            episode_steps += 1

            episode_rewards += reward

        return episode_rewards

    def define_network(self):
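        #: build qnet and target_qnet with a dummy forward pass, sync the
        #: target network, and return the initial qnet weights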

        env = gym.make(self.env_name)
        frame = preprocess_frame(env.reset())
        frames = [frame] * self.n_frames
        state = np.stack(frames, axis=2)[np.newaxis, ...]

        #: define by run
        self.qnet(state)
        self.target_qnet(state)
        self.target_qnet.set_weights(self.qnet.get_weights())

        return self.qnet.get_weights()

    def learn(self, n_episodes, logdir="log"):
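        #: main training loop: collect transitions, update the network every
        #: update_period steps, sync the target, and log to TensorBoard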

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        for episode in range(1, n_episodes + 1):
            env = gym.make(self.env_name)

            frame = util.preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5
            while not done:

                self.steps, episode_steps = self.steps + 1, episode_steps + 1

                state = np.stack(frames, axis=2)[np.newaxis, ...]

                action = self.qnet.sample_action(state, self.epsilon)

                next_frame, reward, done, info = env.step(action)

                episode_rewards += reward

                frames.append(util.preprocess_frame(next_frame))

                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

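                #: treat a lost life as episode end for the stored transition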
                if info["ale.lives"] != lives:
                    lives = info["ale.lives"]
                    transition = (state, action, reward, next_state, True)
                else:
                    transition = (state, action, reward, next_state, done)

                self.replay_buffer.push(transition)

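                #: start updates only after the replay buffer has warmed up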
                if len(self.replay_buffer) >= 50000:
                    if self.steps % self.update_period == 0:

                        if self.use_categorical:
                            loss = self.update_categorical_network()
                        else:
                            loss = self.update_network()

                        with self.summary_writer.as_default():
                            tf.summary.scalar("loss", loss, step=self.steps)
                            tf.summary.scalar("buffer_size",
                                              len(self.replay_buffer),
                                              step=self.steps)
                            tf.summary.scalar("epsilon",
                                              self.epsilon,
                                              step=self.steps)
                            tf.summary.scalar("train_score",
                                              episode_rewards,
                                              step=self.steps)
                            tf.summary.scalar("train_steps",
                                              episode_steps,
                                              step=self.steps)

                    if self.steps % self.target_update_period == 0:
                        self.target_qnet.set_weights(self.qnet.get_weights())

            print(
                f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}"
            )
            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score",
                                      test_scores[0],
                                      step=self.steps)
                    tf.summary.scalar("test_step",
                                      test_steps[0],
                                      step=self.steps)
                    for layer in self.qnet.layers[-3:]:
                        for var in layer.variables:
                            tf.summary.histogram(var.name,
                                                 var,
                                                 step=self.steps)

            if episode % 500 == 0:
                self.qnet.save_weights("checkpoints/qnet")

    def rollout(self, current_weights):
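        #: collect buffer_size transitions with the local network and return
        #: (initial priorities, compressed experiences, pid)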

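        #: hide GPU from remote actor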
        tf.config.set_visible_devices([], 'GPU')

        self.local_qnet.set_weights(current_weights)

        for _ in range(self.buffer_size):

            state = np.stack(self.frames, axis=2)[np.newaxis, ...]

            action = self.local_qnet.sample_action(state, self.epsilon)

            next_frame, reward, done, info = self.env.step(action)

            self.episode_steps += 1

            self.episode_rewards += reward

            self.frames.append(preprocess_frame(next_frame))

            next_state = np.stack(self.frames, axis=2)[np.newaxis, ...]

            if self.lives != info["ale.lives"]:
                #: loss of life as episode ends
                transition = (state, action, reward, next_state, True)
                self.lives = info["ale.lives"]
            else:
                transition = (state, action, reward, next_state, done)

            self.local_buffer.push(transition)

            if done:
                print(self.pid, self.episode_steps, self.episode_rewards,
                      round(self.epsilon, 3))
                self.episode_steps = 0
                self.episode_rewards = 0
                self.lives = 5
                frame = preprocess_frame(self.env.reset())
                for _ in range(self.n_frames):
                    self.frames.append(frame)

        experiences = self.local_buffer.pull()

        states = np.vstack([exp.state
                            for exp in experiences]).astype(np.float32)
        actions = np.vstack([exp.action
                             for exp in experiences]).astype(np.float32)
        rewards = np.array([exp.reward for exp in experiences]).reshape(-1, 1)
        next_states = np.vstack([exp.next_state
                                 for exp in experiences]).astype(np.float32)
        dones = np.array([exp.done for exp in experiences]).reshape(-1, 1)

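        #: recompute Q-values locally to set initial priorities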
        next_actions, next_qvalues = self.local_qnet.sample_actions(
            next_states)

        next_actions_onehot = tf.one_hot(next_actions, self.action_space)

        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1,
                                         keepdims=True)

        discount = self.gamma ** self.nstep
        TQ = rewards + discount * (1 - dones) * max_next_qvalues

        qvalues = self.local_qnet(states)
        actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                    self.action_space)
        Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)

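        #: proportional priorities: (|n-step TD error| + 0.001) ** alpha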
        priorities = ((np.abs(TQ - Q) + 0.001)**self.alpha).flatten()

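        #: serialize and compress each experience before returning it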
        experiences = [zlib.compress(pickle.dumps(exp)) for exp in experiences]

        return priorities, experiences, self.pid
        x2 = self.dense2(x)
        advantages = self.advantages(x2)
        advantages = tf.reshape(advantages,
                                (batch_size, self.action_space, self.n_atoms))

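        #: dueling head: center advantages by their mean, then add the value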
        advantages_mean = tf.reduce_mean(advantages, axis=1, keepdims=True)
        advantages_scaled = advantages - advantages_mean

        logits = value + advantages_scaled
        probs = tf.nn.softmax(logits, axis=2)

        return probs


if __name__ == "__main__":
    import util
    import gym

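    #: smoke test: run NoisyQNetwork on one stacked Breakout state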
    env = gym.make("BreakoutDeterministic-v4")
    frame = util.preprocess_frame(env.reset())
    frames = [frame] * 4
    state = np.stack(frames, axis=2)[np.newaxis, ...]

    action_space = 4
    model = NoisyQNetwork(action_space)
    out = model(state)
    print(out)