def evaluate_policy(policy, env, eval_episodes=10):
    reward_arr = np.zeros(eval_episodes)

    for i in range(eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.

        while not done:
            feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(feasible_actions, env.action_space.n)
            action, _states = policy.predict(obs, mask=action_mask)
            action = AllocationEnv.check_action(obs['board_config'], action)
            obs, reward, done, _ = env.step(action)
            total_reward += reward

        reward_arr[i] = total_reward

    avg_reward = reward_arr.mean()
    std_reward = reward_arr.std()

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.1f} ({:.2f})".format(eval_episodes, avg_reward, std_reward))
    print("---------------------------------------")
    return avg_reward, std_reward
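# A minimal usage sketch for evaluate_policy, mirroring the setup used by the training
# scripts in this repo. This is a sketch under assumptions, not the repo's exact entry
# point: the saved-model file name is hypothetical, and DQN.load is assumed to keep the
# stable-baselines signature.
if __name__ == "__main__":
    store_id = get_store_id(cfg.vals["train_data"])
    prior = Prior(config=cfg.vals)
    env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
    policy = DQN.load(f"./models/{store_id}-dqn.p", env=env)  # hypothetical file name
    avg_reward, std_reward = evaluate_policy(policy, env, eval_episodes=10)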
def evaluate(self):
    gamma = .8
    rewards = []

    for i in range(self.n_episodes):
        self.queue = self.build_queue(self.buffer)
        r_i = 0
        state, _, _, _, _ = self.buffer.sample(batch_size=1)
        state = state[0]
        iter = 0
        cntr = 0

        while True:
            board_cfg = State.get_board_config_from_vec(state,
                                                        n_regions=self.n_regions,
                                                        n_products=self.n_products)
            feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
            action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)
            # M = self.get_m(state, action_mask)
            M = 1

            try:
                _, a, r, s_prime = self.queue[state].pop()
                # _, a, r, s_prime = self.queue[state][-1]
            except IndexError:
                break

            # rejection-sampling step: accept the logged transition with probability
            # pi(a|s) / (M * b(a|s)), where b is the behavior (logging) policy
            alpha = random.random()
            prob_policy = self.policy.proba_step(state.reshape(1, -1), mask=action_mask)[0][a]
            prob_env = self.env_policy.predict_proba(state)[a]
            rejection_tol = (1 / M) * prob_policy / prob_env

            iter += 1
            print(f"eps: {i+1} - iter: {iter} - success: {cntr}")

            if alpha > rejection_tol:
                continue
            else:
                # accepted: accumulate the discounted reward and advance the state
                r_i += gamma**cntr * r
                state = s_prime
                cntr += 1

        if r_i > 0:
            rewards.append(r_i)

    return rewards
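# A self-contained sketch of the rejection-sampling idea behind evaluate() above:
# transitions logged under a behavior policy b are accepted with probability
# pi(a|s) / (M * b(a|s)), so the accepted transitions are distributed as if they had
# been generated by the evaluation policy pi. Toy distributions, illustrative only.
import numpy as np

rng = np.random.default_rng(0)
n_actions = 4
b = np.array([0.4, 0.3, 0.2, 0.1])    # behavior (logging) policy b(a|s)
pi = np.array([0.1, 0.2, 0.3, 0.4])   # evaluation policy pi(a|s)
M = np.max(pi / b)                    # envelope constant so that pi <= M * b

logged_actions = rng.choice(n_actions, size=10000, p=b)
accept_prob = pi[logged_actions] / (M * b[logged_actions])
accepted = logged_actions[rng.random(len(logged_actions)) < accept_prob]

# The empirical distribution of accepted actions approximates pi.
print(np.bincount(accepted, minlength=n_actions) / len(accepted))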
def learn(self):
    for i in range(self.epochs):
        print(f"Epoch {i}/{self.epochs}")
        pbar = tqdm(range(self.rollout_batch_size))

        for b in pbar:
            # state = self.buffer_env.sample(batch_size=1)[0][0]
            state = self.env_model.reset()
            state = State.get_vec_observation(state)

            for h in range(self.rollout):
                pbar.set_description(f"batch: {b} rollout: {h}")

                board_cfg = State.get_board_config_from_vec(state,
                                                            n_regions=self.n_regions,
                                                            n_products=self.n_products)
                feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
                # feasible_actions = AllocationEnv.get_feasible_actions(state["board_config"])
                action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)

                # sample action a_j ~ pi(s_j), epsilon-greedy
                alpha = random.random()
                if alpha < self.eps:
                    action = self.env_model.action_space.sample()
                else:
                    action, _states = self.policy.predict(state.reshape(1, -1), mask=action_mask)

                # compute dynamics from env model
                new_state, r_hat, dones, info = self.env_model.step(action)
                new_state = State.get_vec_observation(new_state)
                reward = self.get_penalized_reward(r_hat, self.lmbda)

                # add (s, a, r, s') to buffer
                self.buffer_model.add(obs_t=state,
                                      action=action,
                                      reward=reward,
                                      obs_tp1=new_state,
                                      done=float(dones))

                state = new_state

        # update policy with samples from D_env and D_model
        self.policy.update_weights(self.buffer_model)
        self.save_buffer()
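# A minimal sketch of the MOPO-style reward penalty that get_penalized_reward applies in
# learn() above, assuming the penalty is lambda times the model's posterior uncertainty.
# penalized_reward is an illustrative helper, not the repo's actual implementation.
import numpy as np

def penalized_reward(reward_samples, lmbda):
    """Penalize the model's mean predicted reward by its posterior spread.

    reward_samples: posterior samples of r(s, a) drawn from the learned environment model.
    """
    r_hat = np.mean(reward_samples)
    uncertainty = np.std(reward_samples)
    # MOPO: r_tilde(s, a) = r_hat(s, a) - lambda * u(s, a)
    return r_hat - lmbda * uncertainty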
def map_optimal_rewards(tabu_len, k):
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []

    for day in range(TEST_T):
        curr_best_val = 0.0
        curr_best_action = 0
        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(curr_state.board_config)

        # score every feasible action for the current day, restoring the state after each trial
        for action in feasible_actions:
            print("Iteration: {}, Action: {}".format(day, action), end='\r')
            action = AllocationEnv.check_action(curr_state.board_config, action)
            proposed_state, reward, b, i = env.step(action)
            env.set_state(curr_state)

            if reward > curr_best_val:
                curr_best_action = action
                curr_best_val = reward

        optimal_actions.append(curr_best_action)
        curr_best_action = AllocationEnv.check_action(curr_state.board_config, curr_best_action)
        # update the state after each day based on the optimal action taken
        state, final_reward, _, _ = env.step(curr_best_action)

        total_reward += final_reward
        results['rewards'].append(total_reward)

        print("best action: {} - reward: {}".format(curr_best_action, final_reward))
        print("total reward: {}".format(total_reward))

    return state, optimal_actions, results
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])

    hyp = {
        "epochs": args.epochs,
        "rollout batch size": args.rollout_batch_size,
        "parameter updates": args.epochs * args.rollout_batch_size,
        "rollouts": args.rollouts,
        "lambda": args.lmbda,
        "batch size": args.batch_size,
        "posterior samples": args.posterior_samples,
        "episode length": cfg.vals["episode_len"],
        "n simulations": args.eval_eps,
        "store": store_id,
        "eps": args.eps
    }

    logger = Logger(hyp, "./results/", "pc_mopo")
    logger.write()

    prior = Prior(config=cfg.vals)
    env_model = AllocationEnv(config=cfg.vals,
                              prior=prior,
                              load_model=True,
                              full_posterior=True,
                              posterior_samples=args.posterior_samples,
                              verbose=False)
    policy = DQN(MlpPolicy, env_model, batch_size=args.batch_size)

    mopo_dqn = Mopo(policy=policy,
                    env_model=env_model,
                    rollout_batch_size=args.rollout_batch_size,
                    epochs=args.epochs,
                    rollout=args.rollouts,
                    n_actions=env_model.n_actions,
                    lmbda=args.lmbda,
                    buffer_path=f"../data/{store_id}-buffer-d-trn.p",
                    # buffer_path=None,
                    eps=args.eps)

    mopo_dqn.learn()

    if os.path.exists(f"./models/{store_id}-{args.file_name}"):
        os.remove(f"./models/{store_id}-{args.file_name}")
    mopo_dqn.policy.save(f"./models/{store_id}-{args.file_name}")
def map_optimal_rewards():
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []
    curr_action = 0

    for day in range(TEST_T):
        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(curr_state.board_config)
        proposed_action = np.random.choice(list(feasible_actions))

        # evaluate the current action, restore the state, then evaluate the proposal
        curr_state_step, curr_reward, b, i = env.step(curr_action)
        env.set_state(curr_state)
        proposed_state, proposed_reward, b, i = env.step(proposed_action)

        curr_f = get_f(ae=-curr_reward, lmbda=LMBDA, log=True, T=T)
        proposed_f = get_f(ae=-proposed_reward, lmbda=LMBDA, log=True, T=T)
        gamma = get_gamma(f_current=curr_f, f_proposed=proposed_f, log=True)

        # Generate random number on log scale
        sr = np.log(random.random())

        if sr < gamma:
            # made progress: accept the proposed action
            # state, final_reward, _, _ = env.step(curr_best_action)  # update the state after each day based on the optimal action taken
            optimal_actions.append(proposed_action)
            curr_best_action = proposed_action
            final_reward = proposed_reward
        else:
            optimal_actions.append(curr_action)
            state, final_reward, _, _ = env.step(curr_action)
            curr_best_action = curr_action

        total_reward += final_reward
        results['rewards'].append(total_reward)

        print("best action: {} - reward: {}".format(curr_best_action, final_reward))
        print("total reward: {}".format(total_reward))

    return state, optimal_actions, results
def sample(self, board_cfg, prod_next, idx_to_prod):
    keys = list(self.action_space)[1:]

    while True:
        # a_idx, action = np.random.choice(keys)
        a_idx = np.random.choice(keys)
        a_idx = AllocationEnv.check_action(board_cfg, a_idx)

        if a_idx >= 0:
            mtx_idx, action = self.action_space[a_idx]
            if a_idx == 0 or idx_to_prod[mtx_idx[1]] in prod_next:
                break

    a_mtx = np.zeros((self.n_regions, self.n_products))
    a_mtx[mtx_idx] = action

    return a_mtx, a_idx
def get_simple_simulator(config):
    """Build the simple (hierarchical, no-precision) simulator environment.

    :param config: project configuration dict (cfg.vals)
    :return: AllocationEnv loaded with the simulator's fitted model
    """
    simulator_cfg = {k: v for k, v in config.items()}
    simulator_cfg["model_type"] = "hierarchical"
    sim_name = (simulator_cfg['model_type'] + "-" +
                simulator_cfg['train_data'].split("/")[-1].split(".")[0] +
                "-no-precision" + ".p")
    simulator_cfg["model_path"] = os.path.join(config["prj_root"], "envs", sim_name)

    simulator_prior = Prior(simulator_cfg)
    simulator = AllocationEnv(config=simulator_cfg, prior=simulator_prior, load_model=True)
    return simulator
from policies.deepq.dqn import DQN
from utils import serialize_floats
import json
import pickle
from utils import get_store_id
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from envs.state import State
from envs.prior import Prior
from envs.allocation_env import AllocationEnv
import config.config as cfg
# MlpPolicy / DummyVecEnv import paths follow stable-baselines; adjust if the repo provides its own
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

TEST_T = cfg.vals["episode_len"]
TIME_STEPS = 1000
LEARNING_START = 200

store_id = get_store_id(cfg.vals["train_data"])

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
n_actions = env.n_actions
# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: env])

model = DQN(MlpPolicy, env, verbose=2, learning_starts=LEARNING_START, gamma=.2,
            exploration_fraction=0.35, exploration_final_eps=0.2)
model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T)

with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f:
    pickle.dump(model.replay_buffer, f)
if __name__ == "__main__":
    prior = Prior(config=cfg.vals)
    env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True, full_posterior=True)
    policy = DQN(MlpPolicy, env, batch_size=32)

    mopo = Mopo(policy=policy,
                env_model=env,
                rollout_batch_size=10,
                epochs=100,
                rollout=10,
                n_actions=env.n_actions,
                lmbda=1e-3,
                buffer_path="../data/random-buffer.p")

    mopo.learn()
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from envs.prior import Prior
from envs.allocation_env import AllocationEnv
from envs.state import State
from envs.features import Features
import config.config as cfg
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
env.reset()
print("n comp: {}".format(env.state.board_config.sum()))

# get a previously seen state: pick a random timestamp from the training data
ts_train = env.time_stamps.container.data
ts_unique = np.unique(ts_train)
ts = np.random.choice(ts_unique)
idx = np.where(ts_train == ts)[0]

p_train = env.X_product.container.data
r_train = env.X_region.container.data

p_state = p_train[idx, :]
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from utils import get_store_id
from envs.prior import Prior
from envs.allocation_env import AllocationEnv
import config.config as cfg
from utils import serialize_floats
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from envs.state import State

store_id = get_store_id(cfg.vals["train_data"])
TIME_STEPS = cfg.vals["episode_len"]

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)

results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

obs = env.reset()
for i in range(TIME_STEPS):
    action = env.action_space.sample()
    proposed_action = AllocationEnv.check_action(obs['board_config'], action)
    new_obs, rew, dones, info = env.step(proposed_action)
    if rew == -1:
        action = 0

    print("Timestep: {}".format(i))
    print("action: {} - reward: {}".format(action, rew))
    print(obs['day_vec'])
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from envs.prior import Prior
import config.config as cfg
from envs.allocation_env import AllocationEnv

N_ITER = int(sys.argv[1])
N_SAMPLES = int(sys.argv[2])
LOAD_MODEL = False

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=LOAD_MODEL)
y_hat = env.train(n_iter=N_ITER,
                  n_samples=N_SAMPLES,
                  fname=cfg.vals['model_path'],
                  debug=False)
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN",
          reset_num_timesteps=True, replay_wrapper=None, learning_curve=False, test_t=None):

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log,
                                                       tb_log_name, new_tb_log) as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size,
                                                         alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=1.0,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        self.cumul_reward = [0.0]
        episode_successes = []
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1,))

        # variables for test evaluation
        test_step = test_t * 3 if test_t is not None else None
        test_results = {'sum': []}
        test_ts = []

        for _ in range(total_timesteps):
            # Periodic test evaluation for the learning curve
            if learning_curve and _ % test_step == 0 and _ > 0:
                print("--> Simulating test period")
                self.env.reset()
                test_r = 0.0
                for i in range(test_t):
                    feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
                    action_mask = AllocationEnv.get_action_mask(feasible_actions, self.env.action_space.n)
                    action, _states = self.predict(obs, mask=action_mask)
                    action = AllocationEnv.check_action(obs['board_config'], action)
                    obs, rewards, dones, info = self.env.step(action)
                    test_r += rewards

                test_results["sum"].append(test_r)
                test_ts.append(_)
                self.env.reset()

                # plot test eval progress
                plt.plot(test_ts, test_results["sum"])
                # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                plt.xlabel("Iteration count")
                plt.ylabel("Total (sum) test reward")
                plt.savefig("figs/rl-learning-curve-{}.pdf".format(cfg.vals['prj_name']))
                plt.clf()
                plt.close()

                # write test eval progress
                write_results = {}
                for k, v in test_results.items():
                    write_results[k] = serialize_floats(v)
                with open("output/rl-learning-curve-{}.json".format(cfg.vals['prj_name']), 'w') as f:
                    json.dump(write_results, f)

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(feasible_actions, self.action_space.n)
            with self.sess.as_default():
                action = self.act(State.get_vec_observation(obs)[None],
                                  update_eps=update_eps, **kwargs, mask=action_mask)[0]
            reset = False

            # Check whether the selected action is feasible
            action = AllocationEnv.check_action(obs['board_config'], action)
            env_action = action
            new_obs, rew, done, info = self.env.step(env_action)

            print("action: {} - reward: {} - eps: {:.4}".format(action, rew, update_eps))
            print(new_obs['day_vec'])
            print(new_obs['board_config'])

            # Store transition in the replay buffer.
            self.replay_buffer.add(State.get_vec_observation(obs), action, rew,
                                   State.get_vec_observation(new_obs), float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                  writer, self.num_timesteps)

            episode_rewards[-1] += rew
            self.cumul_reward.append(self.cumul_reward[-1] + rew)
            if done:
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(self.batch_size,
                                                           beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess,
                                                              options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                    dones, weights, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and \
                    len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            print('timestamp: {}'.format(self.num_timesteps), end='\r\n')
            self.num_timesteps += 1

    return self