Exemplo n.º 1
0
def evaluate_policy(policy, env, eval_episodes=10):
    reward_arr = np.zeros(eval_episodes)

    for i in range(eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.
        while not done:
            feasible_actions = AllocationEnv.get_feasible_actions(
                obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(
                feasible_actions, env.action_space.n)

            action, _states = policy.predict(obs, mask=action_mask)
            action = AllocationEnv.check_action(obs['board_config'], action)
            obs, reward, done, _ = env.step(action)
            total_reward += reward

        reward_arr[i] = total_reward

    avg_reward = reward_arr.mean()
    std_reward = reward_arr.std()

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.1f} ({:.2f})".format(
        eval_episodes, avg_reward, std_reward))
    print("---------------------------------------")
    return avg_reward, std_reward
Exemplo n.º 2
0
def map_optimal_rewards(tabu_len, k):
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []




    for day in range(TEST_T):
        curr_best_val = 0.0
        curr_best_action = 0.0

        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(curr_state.board_config)


        for action in feasible_actions:

            print("Iteration: {}, Action: {}".format(day, action), end='\r')
            action = AllocationEnv.check_action(curr_state.board_config, action)
            proposed_state, reward, b, i = env.step(action)
            env.set_state(curr_state)

            if reward > curr_best_val:
                curr_best_action = action


        optimal_actions.append(curr_best_action)
        curr_best_action = AllocationEnv.check_action(curr_state.board_config, curr_best_action)

        state, final_reward, _ , _ = env.step(curr_best_action)  # update the state after each day based on the optimal action taken

        total_reward += final_reward
        curr_best_val = final_reward
        results['rewards'].append(total_reward)
        print("best action: {} - reward: {}".format(curr_best_action, final_reward))
        print("total reward: {}".format(total_reward))


    return state, optimal_actions, results
Exemplo n.º 3
0
    def sample(self, board_cfg, prod_next, idx_to_prod):

        keys = list(self.action_space)[1:]

        while True:
            #a_idx, action = np.random.choice(keys)
            a_idx = np.random.choice(keys)

            a_idx = AllocationEnv.check_action(board_cfg, a_idx)

            if a_idx >= 0:
                mtx_idx, action = self.action_space[a_idx]

                if a_idx == 0 or idx_to_prod[mtx_idx[1]] in prod_next:
                    break

        a_mtx = np.zeros((self.n_regions, self.n_products))
        a_mtx[mtx_idx] = action
        return a_mtx, a_idx
Exemplo n.º 4
0
results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

for j in range(100):

    obs = env.reset()

    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(
            obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions,
                                                    n_actions)
        action, _states = model.predict(obs, mask=action_mask)

        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))

        obs = new_obs

with open("output/rl-test-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)
Exemplo n.º 5
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              learning_curve=False,
              test_t=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            self.cumul_reward = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            # variables for test eval ##
            test_step = test_t * 3
            test_results = {'sum': []}
            test_ts = []

            for _ in range(total_timesteps):

                ## Test eval period ##
                if learning_curve and _ % test_step == 0 and _ > 0:
                    print("--> Simulating test period")
                    self.env.reset()
                    test_r = 0.0
                    for i in range(test_t):
                        feasible_actions = AllocationEnv.get_feasible_actions(
                            obs["board_config"])
                        action_mask = AllocationEnv.get_action_mask(
                            feasible_actions, self.env.action_space.n)
                        action, _states = self.predict(obs, mask=action_mask)
                        action = AllocationEnv.check_action(
                            obs['board_config'], action)
                        obs, rewards, dones, info = self.env.step(action)
                        test_r += rewards

                    test_results["sum"].append(test_r)
                    test_ts.append(_)
                    self.env.reset()

                    # plot test eval progress
                    plt.plot(test_ts, test_results["sum"])
                    # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                    plt.xlabel("Iteration count")
                    plt.ylabel("Total (sum) test reward")
                    plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                        cfg.vals['prj_name']))
                    plt.clf()
                    plt.close()

                    # write test eval progress
                    write_results = {}
                    for k, v in test_results.items():
                        write_results[k] = serialize_floats(v)

                    with open(
                            "output/rl-learning-curve-{}.json".format(
                                cfg.vals['prj_name']), 'w') as f:
                        json.dump(write_results, f)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                feasible_actions = AllocationEnv.get_feasible_actions(
                    obs["board_config"])
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.action_space.n)
                with self.sess.as_default():
                    action = self.act(State.get_vec_observation(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs,
                                      mask=action_mask)[0]
                reset = False
                # CHECK IF ACTIONS IS FEASIBLE
                action = AllocationEnv.check_action(obs['board_config'],
                                                    action)
                env_action = action
                new_obs, rew, done, info = self.env.step(env_action)
                print("action: {} - reward: {} - eps: {:.4}".format(
                    action, rew, update_eps))
                print(new_obs['day_vec'])
                print(new_obs['board_config'])
                # Store transition in the replay buffer.
                self.replay_buffer.add(State.get_vec_observation(obs), action,
                                       rew, State.get_vec_observation(new_obs),
                                       float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                self.cumul_reward.append(self.cumul_reward[-1] + rew)
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()
                print('timestamp: {}'.format(self.num_timesteps, end='\r\n'))
                self.num_timesteps += 1

        return self