Example No. 1
def evaluate_policy(policy, env, eval_episodes=10):
    reward_arr = np.zeros(eval_episodes)

    for i in range(eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.
        while not done:
            feasible_actions = AllocationEnv.get_feasible_actions(
                obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(
                feasible_actions, env.action_space.n)

            action, _states = policy.predict(obs, mask=action_mask)
            action = AllocationEnv.check_action(obs['board_config'], action)
            obs, reward, done, _ = env.step(action)
            total_reward += reward

        reward_arr[i] = total_reward

    avg_reward = reward_arr.mean()
    std_reward = reward_arr.std()

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.1f} ({:.2f})".format(
        eval_episodes, avg_reward, std_reward))
    print("---------------------------------------")
    return avg_reward, std_reward
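A minimal usage sketch for evaluate_policy, assuming the AllocationEnv, Prior, config, and DQN pieces that appear in the other examples of this listing; the saved-model path and the load() call are assumptions, not taken from the original source.

# Hypothetical wiring; import paths mirror the other examples here.
import config.config as cfg
from envs.prior import Prior
from envs.allocation_env import AllocationEnv
from policies.deepq.dqn import DQN

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
policy = DQN.load("./models/example-policy.p")  # illustrative path; assumes a stable-baselines-style load()
avg_reward, std_reward = evaluate_policy(policy, env, eval_episodes=10)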
Example No. 2
    def evaluate(self):
        gamma = .8
        rewards = []
        for i in range(self.n_episodes):

            self.queue = self.build_queue(self.buffer)

            r_i = 0
            state, _, _, _, _ = self.buffer.sample(batch_size=1)
            state = state[0]

            n_iter = 0
            cntr = 0

            while True:
                board_cfg = State.get_board_config_from_vec(
                    state,
                    n_regions=self.n_regions,
                    n_products=self.n_products)
                feasible_actions = AllocationEnv.get_feasible_actions(
                    board_cfg)
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.n_actions)

                #M = self.get_m(state, action_mask)
                M = 1
                try:
                    _, a, r, s_prime = self.queue[state].pop()
                    #_, a, r, s_prime = self.queue[state][-1]

                except IndexError:
                    break

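                # Rejection sampling: keep the buffered transition only with
                # probability (1/M) * pi(a|s) / pi_env(a|s)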
                alpha = random.random()

                prob_policy = self.policy.proba_step(state.reshape(1, -1),
                                                     mask=action_mask)[0][a]
                prob_env = self.env_policy.predict_proba(state)[a]

                rejection_tol = (1 / M) * prob_policy / prob_env

                n_iter += 1
                print(f"eps: {i+1} - iter: {n_iter} - success: {cntr}")

                if alpha > rejection_tol:
                    continue
                else:
                    #self.queue[state].pop()

                    r_i += (gamma)**cntr * r
                    state = s_prime
                    cntr += 1
            if r_i > 0:
                rewards.append(r_i)

        return rewards
Example No. 3
    def learn(self):
        for i in range(self.epochs):
            print(f"Epoch {i}/{self.epochs}")
            pbar = tqdm(range(self.rollout_batch_size))
            for b in pbar:
                #state = self.buffer_env.sample(batch_size=1)[0][0]
                state = self.env_model.reset()
                state = State.get_vec_observation(state)
                for h in range(self.rollout):
                    pbar.set_description(f"batch: {b} rollout: {h}")
                    board_cfg = State.get_board_config_from_vec(state,
                                                                n_regions=self.n_regions,
                                                                n_products=self.n_products
                                                                )

                    feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
                    #feasible_actions = AllocationEnv.get_feasible_actions(state["board_config"])
                    action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)

                    # sample action a_j ~ pi(s_j)
                    alpha = random.random()

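                    # epsilon-greedy on the model env: random action with
                    # probability eps, otherwise the current policy's choice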
                    if alpha < self.eps:
                        action = self.env_model.action_space.sample()
                    else:
                        action, _states = self.policy.predict(state.reshape(1, -1), mask=action_mask)

                    # compute dynamics from env model
                    new_state, r_hat, dones, info = self.env_model.step(action)
                    new_state = State.get_vec_observation(new_state)

                    reward = self.get_penalized_reward(r_hat, self.lmbda)
                    # add (s, a, r, s') to buffer
                    self.buffer_model.add(obs_t=state,
                                          action=action,
                                          reward=reward,
                                          obs_tp1=new_state,
                                          done=float(dones))

                    state = new_state

                # update policy with samples from the model buffer
                self.policy.update_weights(self.buffer_model)
        self.save_buffer()
Example No. 4
def map_optimal_rewards(tabu_len, k):
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []
    for day in range(TEST_T):
        curr_best_val = 0.0
        curr_best_action = 0

        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(curr_state.board_config)


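        # Greedy one-step search: try every feasible action from the current
        # state, restoring the state after each probe, and keep the best reward.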
        for action in feasible_actions:

            print("Iteration: {}, Action: {}".format(day, action), end='\r')
            action = AllocationEnv.check_action(curr_state.board_config, action)
            proposed_state, reward, b, i = env.step(action)
            env.set_state(curr_state)

            if reward > curr_best_val:
                curr_best_val = reward
                curr_best_action = action

        optimal_actions.append(curr_best_action)
        curr_best_action = AllocationEnv.check_action(curr_state.board_config, curr_best_action)

        state, final_reward, _ , _ = env.step(curr_best_action)  # update the state after each day based on the optimal action taken

        total_reward += final_reward
        curr_best_val = final_reward
        results['rewards'].append(total_reward)
        print("best action: {} - reward: {}".format(curr_best_action, final_reward))
        print("total reward: {}".format(total_reward))


    return state, optimal_actions, results
Example No. 5
def main(args):

    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "epochs": args.epochs,
        "rollout batch size": args.rollout_batch_size,
        "parameter updates": args.epochs * args.rollout_batch_size,
        "rollouts": args.rollouts,
        "lambda": args.lmbda,
        "batch size": args.batch_size,
        "posterior samples": args.posterior_samples,
        "episode length": cfg.vals["episode_len"],
        "n simulations": args.eval_eps,
        "store": store_id,
        "eps": args.eps
    }

    logger = Logger(hyp, "./results/", "pc_mopo")
    logger.write()

    prior = Prior(config=cfg.vals)
    env_model = AllocationEnv(config=cfg.vals,
                              prior=prior,
                              load_model=True,
                              full_posterior=True,
                              posterior_samples=args.posterior_samples,
                              verbose=False)

    policy = DQN(MlpPolicy, env_model, batch_size=args.batch_size)

    mopo_dqn = Mopo(
        policy=policy,
        env_model=env_model,
        rollout_batch_size=args.rollout_batch_size,
        epochs=args.epochs,
        rollout=args.rollouts,
        n_actions=env_model.n_actions,
        lmbda=args.lmbda,
        buffer_path=f"../data/{store_id}-buffer-d-trn.p",
        # buffer_path=None
        eps=args.eps)

    mopo_dqn.learn()

    if os.path.exists(f"./models/{store_id}-{args.file_name}"):
        os.remove(f"./models/{store_id}-{args.file_name}")
    mopo_dqn.policy.save(f"./models/{store_id}-{args.file_name}")
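main() expects an args namespace carrying the fields read above (epochs, rollout_batch_size, rollouts, lmbda, batch_size, posterior_samples, eval_eps, eps, file_name). A minimal argparse sketch that would supply them; the flag names and defaults are assumptions, not taken from the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--rollout-batch-size", type=int, default=10)
    parser.add_argument("--rollouts", type=int, default=10)
    parser.add_argument("--lmbda", type=float, default=1e-3)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--posterior-samples", type=int, default=25)
    parser.add_argument("--eval-eps", type=int, default=10)
    parser.add_argument("--eps", type=float, default=0.1)
    parser.add_argument("--file-name", type=str, default="pc-mopo-policy.p")
    main(parser.parse_args())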
Example No. 6
def map_optimal_rewards():
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []

    curr_action = 0

    for day in range(TEST_T):

        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(
            curr_state.board_config)
        proposed_action = np.random.choice(list(feasible_actions))

        curr_state_step, curr_reward, b, i = env.step(curr_action)
        env.set_state(curr_state)

        proposed_state, proposed_reward, b, i = env.step(proposed_action)

        curr_f = get_f(ae=-curr_reward, lmbda=LMBDA, log=True, T=T)
        proposed_f = get_f(ae=-proposed_reward, lmbda=LMBDA, log=True, T=T)

        gamma = get_gamma(f_current=curr_f, f_proposed=proposed_f, log=True)
        # Generate random number on log scale
        sr = np.log(random.random())

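        # Metropolis-style acceptance in log space: accept the proposal when the
        # log-uniform draw falls below the (log) acceptance ratio gamma.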
        if sr < gamma:  # made progress
            #state, final_reward, _, _ = env.step(curr_best_action)  # update the state after each day based on the optimal action taken
            optimal_actions.append(proposed_action)
            curr_best_action = proposed_action
            final_reward = proposed_reward
            state = proposed_state

        else:
            optimal_actions.append(curr_action)
            env.set_state(curr_state)  # revert to the pre-proposal state before re-stepping
            state, final_reward, _, _ = env.step(curr_action)
            curr_best_action = curr_action

        total_reward += final_reward
        results['rewards'].append(total_reward)
        print("best action: {} - reward: {}".format(curr_best_action,
                                                    final_reward))
        print("total reward: {}".format(total_reward))

    return state, optimal_actions, results
Example No. 7
    def sample(self, board_cfg, prod_next, idx_to_prod):

        keys = list(self.action_space)[1:]

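        # Keep sampling until check_action accepts the index and the action is
        # either index 0 or places a product contained in prod_next.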
        while True:
            #a_idx, action = np.random.choice(keys)
            a_idx = np.random.choice(keys)

            a_idx = AllocationEnv.check_action(board_cfg, a_idx)

            if a_idx >= 0:
                mtx_idx, action = self.action_space[a_idx]

                if a_idx == 0 or idx_to_prod[mtx_idx[1]] in prod_next:
                    break

        a_mtx = np.zeros((self.n_regions, self.n_products))
        a_mtx[mtx_idx] = action
        return a_mtx, a_idx
Example No. 8
def get_simple_simulator(config):
    """Build an AllocationEnv simulator from a copy of the given config.

    The copy forces the "hierarchical" model type and points model_path at the
    matching "-no-precision" environment file before loading the pretrained model.

    :param config: project configuration dictionary (e.g. cfg.vals)
    :return: an AllocationEnv with the pretrained simulator model loaded
    """
    simulator_cfg = {k: v for k, v in config.items()}
    simulator_cfg["model_type"] = "hierarchical"
    sim_nam = simulator_cfg['model_type'] + "-" + simulator_cfg[
        'train_data'].split("/")[-1].split(".")[0] + "-no-precision" + ".p"
    simulator_cfg["model_path"] = os.path.join(config["prj_root"], "envs",
                                               sim_nam)

    simulator_prior = Prior(simulator_cfg)
    simulator = AllocationEnv(config=simulator_cfg,
                              prior=simulator_prior,
                              load_model=True)

    return simulator
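A one-line usage sketch, assuming the project config object used throughout this listing.

import config.config as cfg  # assumed import, as in the other examples

simulator = get_simple_simulator(cfg.vals)
obs = simulator.reset()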
Example No. 9
from policies.deepq.dqn import DQN
from utils import serialize_floats
from utils import get_store_id
import json
import pickle

# Imports required by the code below but missing from this excerpt; the paths
# follow the other examples in this listing (the MlpPolicy path is an assumption).
import config.config as cfg
from envs.prior import Prior
from envs.allocation_env import AllocationEnv
from envs.state import State
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy

TEST_T = cfg.vals["episode_len"]
TIME_STEPS = 1000
LEARNING_START = 200

store_id = get_store_id(cfg.vals["train_data"])
prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
n_actions = env.n_actions
# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: env])

model = DQN(MlpPolicy,
            env,
            verbose=2,
            learning_starts=LEARNING_START,
            gamma=.2,
            exploration_fraction=0.35,
            exploration_final_eps=0.2)
model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T)

with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f:
    pickle.dump(model.replay_buffer, f)
Example No. 10
                    # add (s, a, r, s') to buffer
                    self.buffer_model.add(obs_t=state,
                                          action=action,
                                          reward=reward,
                                          obs_tp1=new_state,
                                          done=float(dones))

                    state = new_state



                # update policy with samples from the model buffer
                self.policy.update_weights(self.buffer_model)
        self.save_buffer()


if __name__ == "__main__":
    prior = Prior(config=cfg.vals)
    env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True, full_posterior=True)
    policy = DQN(MlpPolicy, env, batch_size=32)

    mopo = Mopo(policy=policy,
                env_model=env,
                rollout_batch_size=10,
                epochs=100,
                rollout=10,
                n_actions=env.n_actions,
                lmbda=1e-3,
                buffer_path="../data/random-buffer.p")

    mopo.learn()
Example No. 11
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from envs.prior import Prior
from envs.allocation_env import AllocationEnv
from envs.state import State
from envs.features import Features
import config.config as cfg
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
env.reset()

print("n comp: {}".format(env.state.board_config.sum()))

# get seen state

ts_train = env.time_stamps.container.data
ts_unique = np.unique(ts_train)
ts = np.random.choice(ts_unique)

idx = np.where(ts_train == ts)[0]

p_train = env.X_product.container.data
r_train = env.X_region.container.data

p_state = p_train[idx, :]
Example No. 12
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from utils import get_store_id

from envs.prior import Prior
from envs.allocation_env import AllocationEnv
import config.config as cfg
from utils import serialize_floats
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from envs.state import State

store_id = get_store_id(cfg.vals["train_data"])
TIME_STEPS = cfg.vals["episode_len"]
prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

obs = env.reset()
for i in range(TIME_STEPS):
    action = env.action_space.sample()
    proposed_action = AllocationEnv.check_action(obs['board_config'], action)
    new_obs, rew, dones, info = env.step(proposed_action)

    if rew == -1:
        action = 0

    print("Timestep: {}".format(i))
    print("action: {} - reward: {}".format(action, rew))
    print(obs['day_vec'])
Example No. 13
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from envs.prior import Prior
import config.config as cfg
from envs.allocation_env import AllocationEnv

N_ITER = int(sys.argv[1])
N_SAMPLES = int(sys.argv[2])
LOAD_MODEL = False

prior = Prior(config=cfg.vals)

env = AllocationEnv(config=cfg.vals, prior=prior, load_model=LOAD_MODEL)
y_hat = env.train(n_iter=N_ITER,
                  n_samples=N_SAMPLES,
                  fname=cfg.vals['model_path'],
                  debug=False)
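The script reads both hyperparameters from the command line: sys.argv[1] is the number of training iterations (N_ITER) and sys.argv[2] the number of posterior samples (N_SAMPLES), so it is invoked with two integer arguments, e.g. python <script>.py 500 1000, where the script name and values are purely illustrative.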
Example No. 14
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              learning_curve=False,
              test_t=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            self.cumul_reward = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            # variables for test eval ##
            test_step = test_t * 3 if test_t is not None else None
            test_results = {'sum': []}
            test_ts = []

            for step in range(total_timesteps):

                ## Test eval period ##
                if learning_curve and step % test_step == 0 and step > 0:
                    print("--> Simulating test period")
                    self.env.reset()
                    test_r = 0.0
                    for i in range(test_t):
                        feasible_actions = AllocationEnv.get_feasible_actions(
                            obs["board_config"])
                        action_mask = AllocationEnv.get_action_mask(
                            feasible_actions, self.env.action_space.n)
                        action, _states = self.predict(obs, mask=action_mask)
                        action = AllocationEnv.check_action(
                            obs['board_config'], action)
                        obs, rewards, dones, info = self.env.step(action)
                        test_r += rewards

                    test_results["sum"].append(test_r)
                    test_ts.append(step)
                    self.env.reset()

                    # plot test eval progress
                    plt.plot(test_ts, test_results["sum"])
                    # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                    plt.xlabel("Iteration count")
                    plt.ylabel("Total (sum) test reward")
                    plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                        cfg.vals['prj_name']))
                    plt.clf()
                    plt.close()

                    # write test eval progress
                    write_results = {}
                    for k, v in test_results.items():
                        write_results[k] = serialize_floats(v)

                    with open(
                            "output/rl-learning-curve-{}.json".format(
                                cfg.vals['prj_name']), 'w') as f:
                        json.dump(write_results, f)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

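                # Build a mask of feasible actions for the current board so
                # self.act() only considers valid moves.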
                feasible_actions = AllocationEnv.get_feasible_actions(
                    obs["board_config"])
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.action_space.n)
                with self.sess.as_default():
                    action = self.act(State.get_vec_observation(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs,
                                      mask=action_mask)[0]
                reset = False
                # CHECK IF ACTIONS IS FEASIBLE
                action = AllocationEnv.check_action(obs['board_config'],
                                                    action)
                env_action = action
                new_obs, rew, done, info = self.env.step(env_action)
                print("action: {} - reward: {} - eps: {:.4}".format(
                    action, rew, update_eps))
                print(new_obs['day_vec'])
                print(new_obs['board_config'])
                # Store transition in the replay buffer.
                self.replay_buffer.add(State.get_vec_observation(obs), action,
                                       rew, State.get_vec_observation(new_obs),
                                       float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                self.cumul_reward.append(self.cumul_reward[-1] + rew)
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()
                print('timestep: {}'.format(self.num_timesteps), end='\r\n')
                self.num_timesteps += 1

        return self