def evaluate_policy(policy, env, eval_episodes=10):
    reward_arr = np.zeros(eval_episodes)
    for i in range(eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.
        while not done:
            feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(feasible_actions, env.action_space.n)
            action, _states = policy.predict(obs, mask=action_mask)
            action = AllocationEnv.check_action(obs['board_config'], action)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        reward_arr[i] = total_reward

    avg_reward = reward_arr.mean()
    std_reward = reward_arr.std()

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.1f} ({:.2f})".format(eval_episodes, avg_reward, std_reward))
    print("---------------------------------------")
    return avg_reward, std_reward
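# Usage sketch (an assumption, not taken from the source): evaluate_policy expects a
# policy whose predict() accepts an action mask (the project's modified DQN) and an
# AllocationEnv instance. The checkpoint path and the AllocationEnv constructor
# arguments below are hypothetical.
model = DQN.load("models/dqn-{}.p".format(cfg.vals['prj_name']))  # hypothetical checkpoint path
env = AllocationEnv(config=cfg.vals)  # hypothetical constructor signature
evaluate_policy(model, env, eval_episodes=10)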
def map_optimal_rewards(tabu_len, k):
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []

    for day in range(TEST_T):
        curr_best_val = 0.0
        curr_best_action = 0
        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(curr_state.board_config)

        # greedily probe every feasible action for the current day,
        # restoring the environment state after each probe
        for action in feasible_actions:
            print("Iteration: {}, Action: {}".format(day, action), end='\r')
            action = AllocationEnv.check_action(curr_state.board_config, action)
            proposed_state, reward, b, i = env.step(action)
            env.set_state(curr_state)
            if reward > curr_best_val:
                curr_best_val = reward
                curr_best_action = action

        optimal_actions.append(curr_best_action)
        curr_best_action = AllocationEnv.check_action(curr_state.board_config, curr_best_action)
        # update the state after each day based on the optimal action taken
        state, final_reward, _, _ = env.step(curr_best_action)
        total_reward += final_reward
        results['rewards'].append(total_reward)
        print("best action: {} - reward: {}".format(curr_best_action, final_reward))
        print("total reward: {}".format(total_reward))

    return state, optimal_actions, results
def sample(self, board_cfg, prod_next, idx_to_prod):
    keys = list(self.action_space)[1:]
    while True:
        # a_idx, action = np.random.choice(keys)
        a_idx = np.random.choice(keys)
        a_idx = AllocationEnv.check_action(board_cfg, a_idx)
        if a_idx >= 0:
            mtx_idx, action = self.action_space[a_idx]
            if a_idx == 0 or idx_to_prod[mtx_idx[1]] in prod_next:
                break

    a_mtx = np.zeros((self.n_regions, self.n_products))
    a_mtx[mtx_idx] = action
    return a_mtx, a_idx
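# Toy illustration (an assumption about the data layout, not from the source): each
# action_space entry is read here as a ((region_idx, product_idx), delta) pair, so the
# sampled action expands into a sparse n_regions x n_products matrix with a single
# non-zero cell, exactly as in the last two lines of sample() above.
import numpy as np

n_regions, n_products = 4, 6              # hypothetical board dimensions
mtx_idx, delta = (2, 5), 1                # e.g. add one facing of product 5 in region 2
a_mtx = np.zeros((n_regions, n_products))
a_mtx[mtx_idx] = delta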
results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

for j in range(100):
    obs = env.reset()
    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions, n_actions)
        action, _states = model.predict(obs, mask=action_mask)
        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])
        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))
        obs = new_obs

with open("output/rl-test-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)
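# Follow-up sketch (not from the source): read back the cumulative-reward trace written
# above and plot it. Only the output path format is taken from the snippet; the plotting
# choices and figure path are assumptions.
import json
import matplotlib.pyplot as plt

with open("output/rl-test-{}.json".format(cfg.vals['prj_name'])) as f:
    test_results = json.load(f)

plt.plot(test_results['rewards'])
plt.xlabel("Step")
plt.ylabel("Cumulative test reward")
plt.savefig("figs/rl-test-{}.pdf".format(cfg.vals['prj_name']))  # hypothetical output path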
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN",
          reset_num_timesteps=True, replay_wrapper=None, learning_curve=False, test_t=None):

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=1.0,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        self.cumul_reward = [0.0]
        episode_successes = []
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1,))

        # variables for test evaluation
        test_step = test_t * 3 if test_t is not None else None
        test_results = {'sum': []}
        test_ts = []

        for _ in range(total_timesteps):
            # Test evaluation period
            if learning_curve and _ % test_step == 0 and _ > 0:
                print("--> Simulating test period")
                obs = self.env.reset()
                test_r = 0.0
                for i in range(test_t):
                    feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
                    action_mask = AllocationEnv.get_action_mask(feasible_actions, self.env.action_space.n)
                    action, _states = self.predict(obs, mask=action_mask)
                    action = AllocationEnv.check_action(obs['board_config'], action)
                    obs, rewards, dones, info = self.env.step(action)
                    test_r += rewards
                test_results["sum"].append(test_r)
                test_ts.append(_)
                obs = self.env.reset()

                # plot test eval progress
                plt.plot(test_ts, test_results["sum"])
                # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                plt.xlabel("Iteration count")
                plt.ylabel("Total (sum) test reward")
                plt.savefig("figs/rl-learning-curve-{}.pdf".format(cfg.vals['prj_name']))
                plt.clf()
                plt.close()

                # write test eval progress
                write_results = {}
                for k, v in test_results.items():
                    write_results[k] = serialize_floats(v)
                with open("output/rl-learning-curve-{}.json".format(cfg.vals['prj_name']), 'w') as f:
                    json.dump(write_results, f)

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(feasible_actions, self.action_space.n)
            with self.sess.as_default():
                action = self.act(State.get_vec_observation(obs)[None], update_eps=update_eps, **kwargs,
                                  mask=action_mask)[0]
            reset = False

            # check that the chosen action is feasible
            action = AllocationEnv.check_action(obs['board_config'], action)
            env_action = action
            new_obs, rew, done, info = self.env.step(env_action)
            print("action: {} - reward: {} - eps: {:.4}".format(action, rew, update_eps))
            print(new_obs['day_vec'])
            print(new_obs['board_config'])

            # Store transition in the replay buffer.
            self.replay_buffer.add(State.get_vec_observation(obs), action, rew,
                                   State.get_vec_observation(new_obs), float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                  writer, self.num_timesteps)

            episode_rewards[-1] += rew
            self.cumul_reward.append(self.cumul_reward[-1] + rew)
            if done:
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(self.batch_size,
                                                           beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess,
                                                              options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                    dones, weights, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and \
                    len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            print('timestep: {}'.format(self.num_timesteps), end='\r\n')
            self.num_timesteps += 1

    return self
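# Usage sketch (an assumption, not from the source): learning_curve and test_t are the
# arguments added on top of the stock stable-baselines DQN.learn(); with learning_curve=True
# the agent pauses every 3 * test_t training steps, rolls out a masked greedy policy for
# test_t steps, and saves the learning-curve plot/JSON shown above. The policy class and
# hyperparameters below are hypothetical.
model = DQN(MlpPolicy, env, buffer_size=50000, learning_starts=100, verbose=1)
model.learn(total_timesteps=10000, learning_curve=True, test_t=TEST_T)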