Example #1
        taskid, random_state=run_id)

    # nn with embedding related
    nn_with_embedding = NN_with_EntityEmbedding(
        X_train,
        y_train_int,
        categorical_features,
        categorical_names,
        class_names,
        epochs=epochs,
        batch_size=batch_size,
    )
    nn_with_embedding_loss, nn_with_embedding_score = nn_with_embedding.evaluate(
        X_test, y_test_int)
    print("nn_with_embedding prediction score: ", str(nn_with_embedding_score))
    logger.log('nn_with_embedding', taskid, run_id, nn_with_embedding_score)

    # nn related
    nn = NN(
        X_train,
        y_train_int,
        categorical_features,
        categorical_names,
        class_names,
        epochs=epochs,
        batch_size=batch_size,
    )
    nn_loss, nn_score = nn.evaluate(X_test, y_test_int)
    print("nn prediction score: ", str(nn_score))
    logger.log('nn', taskid, run_id, nn_score)
Example #2
        batch_ac.append(ac)
        batch_rcw.append((row, col-w, w-prev_w))
        batch_frames.append(frames)
    batch_prev_frames = np.array(batch_prev_frames)
    batch_ac = np.array(batch_ac)
    batch_rcw = np.array(batch_rcw)[:, None, :]
    batch_frames = np.array(batch_frames)
    q_map._optimize(batch_prev_frames, batch_ac, batch_rcw, batch_frames, batch_dones, batch_weights)
    if t % args.target == 0:
        q_map.update_target()

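    # Every 50 optimization steps: measure the prediction error on the held-out
    # test levels and save a visual comparison (observations on top, ground-truth
    # Q-maps in the middle, predicted Q-maps at the bottom).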
    if t % 50 == 0:
        losses = []
        all_images = []
        for i_level in range(len(test_levels)):
            pred_qmaps = q_map.compute_q_values(test_obs[i_level])
            true_qmaps = test_qmaps[i_level]
            loss = np.mean((pred_qmaps - true_qmaps)**2)
            losses.append(loss)
            ob_images = np.concatenate(test_obs[i_level][image_indexes[i_level]], axis=1)
            pred_images = np.concatenate((color_map(pred_qmaps[image_indexes[i_level]].max(3))[:, :, :, :3] * 255).astype(np.uint8), axis=1)
            true_images = np.concatenate((color_map(true_qmaps[image_indexes[i_level]].max(3))[:, :, :, :3] * 255).astype(np.uint8), axis=1)
            all_images.append(np.concatenate((ob_images, true_images, pred_images), axis=0))
        img = np.concatenate(all_images, axis=0)
        toimage(img, cmin=0, cmax=255).save('{}/images/{}.png'.format(path, t))
        if args.render:
            img = np.repeat(np.repeat(img, 3, 0), 3, 1)
            viewer.imshow(img)
        print(t*args.batch, 'Losses:', *losses)
        loss_logger.log(t, *losses)
Example #3
class Q_Map_DQN_Agent(Agent):
    def __init__(
            self,
            # All
            observation_space,
            n_actions,
            coords_shape,
            double_replay_buffer,
            task_gamma,
            exploration_schedule,
            seed,
            learning_starts=1000,
            train_freq=1,
            print_freq=100,
            env_name='ENV',
            agent_name='AGENT',
            renderer_viewer=True,
            # DQN:
            dqn_q_func=None,
            dqn_lr=5e-4,
            dqn_batch_size=32,
            dqn_optim_iters=1,
            dqn_target_net_update_freq=500,
            dqn_grad_norm_clip=100,
            dqn_double_q=True,
            # Q-Map:
            q_map_model=None,
            q_map_random_schedule=None,
            q_map_greedy_bias=0.5,
            q_map_timer_bonus=0.5,
            q_map_lr=5e-4,
            q_map_gamma=0.9,
            q_map_n_steps=1,
            q_map_batch_size=32,
            q_map_optim_iters=1,
            q_map_target_net_update_freq=500,
            q_map_min_goal_steps=10,
            q_map_max_goal_steps=20,
            q_map_grad_norm_clip=1000,
            q_map_double_q=True):

        # All

        self.observation_space = observation_space
        self.n_actions = n_actions
        self.coords_shape = coords_shape
        self.double_replay_buffer = double_replay_buffer
        self.task_gamma = task_gamma
        self.exploration_schedule = exploration_schedule
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.print_freq = print_freq

        agent_name += '-train' + str(train_freq)

        # DQN

        if dqn_q_func is not None:
            self.use_dqn = True
            agent_name += '-'
            agent_name += 'DQN-lr' + str(dqn_lr) + '-freq-' + str(train_freq)
            self.dqn_target_net_update_freq = dqn_target_net_update_freq

            self.dqn = DQN(model=dqn_q_func,
                           observation_space=observation_space,
                           n_actions=n_actions,
                           gamma=task_gamma,
                           lr=dqn_lr,
                           replay_buffer=double_replay_buffer,
                           batch_size=dqn_batch_size,
                           optim_iters=dqn_optim_iters,
                           grad_norm_clip=dqn_grad_norm_clip,
                           double_q=dqn_double_q)
        else:
            self.use_dqn = False

        # Q-MAP

        if q_map_model is not None:
            agent_name += '-'
            agent_name += ('Q-MAP-' + q_map_model.description +
                           '-' + str(q_map_min_goal_steps) +
                           '-' + str(q_map_max_goal_steps) +
                           '-gamma' + str(q_map_gamma) +
                           '-lr' + str(q_map_lr) +
                           '-bias' + str(q_map_greedy_bias) +
                           '-bonus' + str(q_map_timer_bonus))
            self.use_q_map = True
            self.q_map_timer_bonus = q_map_timer_bonus
            self.using_q_map_starts = 2 * self.learning_starts
            self.q_map_random_schedule = q_map_random_schedule
            self.q_map_greedy_bias = q_map_greedy_bias
            self.q_map_goal_proba = 1  # TODO
            self.q_map_gamma = q_map_gamma
            self.q_map_target_net_update_freq = q_map_target_net_update_freq
            self.q_map_min_goal_steps = q_map_min_goal_steps
            self.q_map_max_goal_steps = q_map_max_goal_steps
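            # A goal n steps away has a Q-map value of gamma**(n - 1) (cf. the
            # expected-steps computation in choose_action), so goals between
            # min and max goal steps away have Q-values in this range: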
            self.q_map_min_q_value = q_map_gamma**(q_map_max_goal_steps - 1)
            self.q_map_max_q_value = q_map_gamma**(q_map_min_goal_steps - 1)
            self.q_map_goal = None
            self.q_map_goal_timer = 0

            self.q_map = Q_Map(model=q_map_model,
                               observation_space=observation_space,
                               coords_shape=coords_shape,
                               n_actions=n_actions,
                               gamma=q_map_gamma,
                               n_steps=q_map_n_steps,
                               lr=q_map_lr,
                               replay_buffer=double_replay_buffer,
                               batch_size=q_map_batch_size,
                               optim_iters=q_map_optim_iters,
                               grad_norm_clip=q_map_grad_norm_clip,
                               double_q=q_map_double_q)
        else:
            self.use_q_map = False

        if not self.use_dqn and not self.use_q_map:
            agent_name += 'random'

        else:
            self.tf_saver = tf.train.Saver()
            agent_name += '-memory' + str(double_replay_buffer._maxsize)

        # All

        home = os.path.expanduser('~')
        sub_name = 'seed-{}_{}'.format(
            seed,
            datetime.utcnow().strftime('%F_%H-%M-%S-%f'))
        self.path = '{}/results/q-map/{}/{}/{}'.format(home, env_name,
                                                       agent_name, sub_name)
        # log exploration for debugging
        exploration_labels = [
            'steps', 'planned exploration', 'current exploration',
            'random actions', 'goal actions', 'greedy actions'
        ]
        self.exploration_logger = CSVLogger(exploration_labels,
                                            self.path + '/exploration')
        # videos etc.
        self.renderer = Q_Map_Renderer(self.path, viewer=renderer_viewer)
        # path to store
        self.tensorflow_path = self.path + '/tensorflow'
        if not os.path.exists(self.tensorflow_path):
            os.makedirs(self.tensorflow_path)

        U.initialize()
        self.t = 0
        self.episode_rewards = []
        self.random_proba = self.exploration_schedule.value(0)
        self.random_freq = self.exploration_schedule.value(0)
        self.greedy_freq = 1.0 - self.random_freq
        self.goal_freq = 0.0

        if self.use_dqn:
            self.dqn.update_target()

        self.seed(seed)

    def seed(self, seed):
        self.np_random, seed = seeding.np_random(seed)
        if self.use_dqn:
            self.dqn.seed(seed)
        if self.use_q_map:
            self.q_map.seed(seed)
        return [seed]

    def reset(self, ob):
        if self.use_q_map:
            self.q_map_goal_timer = 0
            self.q_map_goal = None

        frames = ob[0]
        ac = self.choose_action(ob)

        self.log()
        self.episode_rewards.append(0.0)
        self.prev_ob = ob
        self.prev_ac = ac

        return ac

    def step(self, ob, rew, done):
        prev_frames, (_, _, prev_w), _, _ = self.prev_ob
        frames, (row, col, w), _, _ = ob

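        # store the transition with the window-relative column (col - w) and the
        # change in scroll offset (w - prev_w)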
        if self.double_replay_buffer is not None:
            self.double_replay_buffer.add(prev_frames, self.prev_ac, rew,
                                          (row, col - w, w - prev_w), frames,
                                          done)

        self.optimize()

        if not done:
            ac = self.choose_action(ob)
        else:
            ac = None
            self.add_to_renderer(ob)

        self.t += 1
        self.episode_rewards[-1] += rew
        self.prev_ob = ob
        self.prev_ac = ac

        return ac

    def choose_action(self, ob):
        frames, (row, col, w), screen, (full_r, full_c) = ob

        q_map_values = None
        q_map_candidates = []
        q_map_biased_candidates = []

        # render Q-maps all the time even if we do not need them
        if self.use_q_map:
            q_map_values = self.q_map.compute_q_values(frames[None])[0]  # (rows, cols, acs)

        if self.np_random.rand() < self.random_proba or (
                not self.use_dqn and self.t <= self.using_q_map_starts):
            ac = self.np_random.randint(self.n_actions)
            action_type = 'random'

        else:
            # Q-Map available and started to train
            if self.use_q_map and self.t > self.using_q_map_starts:
                # goal no longer reachable (scrolled past the window)
                if self.q_map_goal_timer > 0 and self.q_map_goal[1] < w:
                    self.q_map_goal_timer = 0
                    self.q_map_goal = None

                # reached the goal
                if self.q_map_goal_timer > 0 and (row, col) == self.q_map_goal:
                    self.q_map_goal_timer = 0
                    self.q_map_goal = None

                # no more goal
                if self.q_map_goal_timer == 0:
                    if self.np_random.rand() < self.q_map_goal_proba:
                        # find a new goal
                        q_map_max_values = q_map_values.max(2)  # (rows, cols)
                        q_map_candidates_mask = np.logical_and(
                            self.q_map_min_q_value <= q_map_max_values,
                            self.q_map_max_q_value >= q_map_max_values)
                        q_map_candidates = np.where(q_map_candidates_mask)
                        q_map_candidates = np.dstack(q_map_candidates)[
                            0]  # list of (row, col)

                        if len(q_map_candidates) > 0:
                            # goals compatible with greedy action
                            if (self.use_dqn and self.np_random.rand() <
                                    self.q_map_greedy_bias):
                                greedy_ac = self.dqn.choose_action(
                                    frames, stochastic=False)
                                q_map_biased_candidates_mask = np.logical_and(
                                    q_map_candidates_mask,
                                    q_map_values.argmax(2) == greedy_ac)
                                q_map_biased_candidates = np.where(
                                    q_map_biased_candidates_mask)
                                q_map_biased_candidates = np.dstack(
                                    q_map_biased_candidates)[
                                        0]  # list of (row, col)

                            # same DQN and Q-Map action
                            if len(q_map_biased_candidates) > 0:
                                goal_idx = self.np_random.randint(
                                    len(q_map_biased_candidates))
                                q_map_goal_row, q_map_goal_col_local = q_map_biased_candidates[
                                    goal_idx]
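                                # invert q = gamma**(steps - 1) to estimate the
                                # number of steps needed to reach this goal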
                                q_map_expected_steps = math.log(
                                    q_map_max_values[q_map_goal_row,
                                                     q_map_goal_col_local],
                                    self.q_map_gamma) + 1
                                self.q_map_goal_timer = math.ceil(
                                    1.5 * q_map_expected_steps)  # 50% bonus
                                self.q_map_goal = (q_map_goal_row,
                                                   q_map_goal_col_local + w)
                                ac = greedy_ac
                                action_type = 'dqn/qmap'

                            # greedy Q-Map action
                            else:
                                goal_idx = self.np_random.randint(
                                    len(q_map_candidates))
                                q_map_goal_row, q_map_goal_col_local = q_map_candidates[
                                    goal_idx]
                                q_map_expected_steps = math.log(
                                    q_map_max_values[q_map_goal_row,
                                                     q_map_goal_col_local],
                                    self.q_map_gamma) + 1
                                self.q_map_goal_timer = math.ceil(
                                    (1. + self.q_map_timer_bonus) *
                                    q_map_expected_steps)
                                self.q_map_goal = (q_map_goal_row,
                                                   q_map_goal_col_local + w)
                                ac, q_map_values = self.q_map.choose_action(
                                    None,
                                    (q_map_goal_row, q_map_goal_col_local),
                                    q_map_values
                                )  # no need to recompute the Q-Map
                                action_type = 'qmap'

                            self.q_map_goal_timer -= 1
                            if self.q_map_goal_timer == 0:
                                self.q_map_goal = None

                        # random action
                        else:
                            self.q_map_goal_timer = 0
                            self.q_map_goal = None
                            ac = self.np_random.randint(self.n_actions)
                            action_type = 'random'

                    # DQN action
                    else:
                        ac = self.dqn.choose_action(frames, stochastic=False)
                        action_type = 'dqn'

                # Q-Map action
                else:
                    q_map_goal_row, q_map_goal_col = self.q_map_goal
                    q_map_goal_col_local = q_map_goal_col - w
                    ac, q_map_values = self.q_map.choose_action(
                        frames, (q_map_goal_row, q_map_goal_col_local))
                    self.q_map_goal_timer -= 1
                    if self.q_map_goal_timer == 0:
                        self.q_map_goal = None
                    action_type = 'qmap'

            # DQN action
            else:
                ac = self.dqn.choose_action(frames, stochastic=False)
                action_type = 'dqn'

        # rendering
        self.add_to_renderer(ob, q_map_values, ac, action_type,
                             q_map_candidates, q_map_biased_candidates)

        # update exploration
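        # (exponential moving averages, smoothing 0.01, of how often each action
        # type gets chosen)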
        if action_type == 'dqn/qmap':
            self.random_freq += 0.01 * (0 - self.random_freq)
            self.greedy_freq += 0.01 * (1 - self.greedy_freq)
            self.goal_freq += 0.01 * (0 - self.goal_freq)  # TODO: 1?
        elif action_type == 'dqn':
            self.random_freq += 0.01 * (0 - self.random_freq)
            self.greedy_freq += 0.01 * (1 - self.greedy_freq)
            self.goal_freq += 0.01 * (0 - self.goal_freq)
        elif action_type == 'qmap':
            self.random_freq += 0.01 * (0 - self.random_freq)
            self.greedy_freq += 0.01 * (0 - self.greedy_freq)
            self.goal_freq += 0.01 * (1 - self.goal_freq)
        elif action_type == 'random':
            self.random_freq += 0.01 * (1 - self.random_freq)
            self.greedy_freq += 0.01 * (0 - self.greedy_freq)
            self.goal_freq += 0.01 * (0 - self.goal_freq)
        else:
            raise NotImplementedError(
                'unknown action type {}'.format(action_type))

        target_exploration = self.exploration_schedule.value(self.t)
        current_exploration = (1.0 - self.greedy_freq)
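        # Once the Q-Map is in use, take the purely random probability from its own
        # schedule and nudge the goal probability so the measured exploration
        # tracks the scheduled target.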
        if self.use_q_map and self.t >= self.using_q_map_starts:
            self.random_proba = self.q_map_random_schedule.value(self.t)
            if current_exploration > target_exploration:
                self.q_map_goal_proba -= 0.001
            elif current_exploration < target_exploration:
                self.q_map_goal_proba += 0.001
        else:
            self.random_proba = self.exploration_schedule.value(self.t)

        if (self.t + 1) % 100 == 0:
            self.exploration_logger.log(self.t + 1, target_exploration,
                                        current_exploration, self.random_freq,
                                        self.goal_freq, self.greedy_freq)

        return ac

    def optimize(self):
        if ((self.use_dqn or self.use_q_map) and self.t >= self.learning_starts
                and self.t % self.train_freq == 0):
            if self.use_dqn:
                self.dqn.optimize(self.t)

            if self.use_q_map:
                self.q_map.optimize(self.t)

        if self.use_dqn and self.t >= self.learning_starts and self.t % self.dqn_target_net_update_freq == 0:
            self.dqn.update_target()

        if self.use_q_map and self.t >= self.learning_starts and self.t % self.q_map_target_net_update_freq == 0:
            self.q_map.update_target()

        # save the session
        if (self.use_dqn or self.use_q_map) and (self.t + 1) % 100000 == 0:
            file_name = '{}/step_{}.ckpt'.format(self.tensorflow_path, self.t + 1)
            print('saving tensorflow session to', file_name)
            self.tf_saver.save(tf.get_default_session(), file_name)

    def log(self):
        if (self.t > 0 and self.print_freq is not None
                and len(self.episode_rewards) % self.print_freq == 0):
            mean_100ep_reward = np.mean(self.episode_rewards[-100:])
            num_episodes = len(self.episode_rewards)

            logger.record_tabular('steps', self.t)
            logger.record_tabular('episodes', num_episodes)
            logger.record_tabular('mean 100 episode reward',
                                  '{:.3f}'.format(mean_100ep_reward))
            logger.record_tabular(
                'exploration (target)', '{:.3f} %'.format(
                    100 * self.exploration_schedule.value(self.t)))
            logger.record_tabular(
                'exploration (current)',
                '{:.3f} %'.format(100 * (1.0 - self.greedy_freq)))
            logger.dump_tabular()

    def load(self, path):
        self.tf_saver.restore(tf.get_default_session(), path)
        print('model restored :)')

    def add_to_renderer(self,
                        ob,
                        q_map_values=None,
                        ac=None,
                        action_type='',
                        q_map_candidates=[],
                        q_map_biased_candidates=[]):
        if self.renderer is not None:
            if self.use_q_map and self.q_map_goal is not None:
                goal = self.q_map_goal
                assert self.q_map_goal_timer > 0
            else:
                goal = None
            self.renderer.add(ob, self.coords_shape, q_map_values, ac,
                              action_type, self.n_actions, q_map_candidates,
                              q_map_biased_candidates, goal)
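
For reference, a minimal usage sketch of the reset/step protocol that Example #3 expects from a driver loop. The env object and its API here are hypothetical and stand in for an environment that returns observations shaped like (frames, (row, col, w), screen, (full_r, full_c)); only the agent calls match the code above.

# Hypothetical driver loop: 'env' and 'agent' are assumed to be constructed elsewhere.
ob = env.reset()
ac = agent.reset(ob)                  # first action of the episode
while ac is not None:
    ob, rew, done = env.step(ac)      # hypothetical environment API
    ac = agent.step(ob, rew, done)    # returns None once the episode is done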