def evaluate_inferred_reward(reward_irl, inferred_rewards, image_irl, start_states_irl, horizon):
    """Calculates 1-%regret for each (reward, inferred_reward) pair.

    Regret is the amount of reward lost by planning with the inferred reward
    rather than planning with the true reward.

    reward_irl (list): list of true rewards (n, imsize, imsize)
    inferred_rewards (list): list of inferred rewards (n, imsize, imsize)
    image_irl (list): list of walls for each grid (n, imsize, imsize)
    start_states_irl (list): list of start states (n, 2), each entry of the form [x, y]
    horizon (integer): number of steps used to evaluate the inferred reward

    Returns the average 1-%regret across all pairs.
    """
    reward_percents = []
    for i, (label, reward, wall, start_state) in enumerate(
            zip(reward_irl, inferred_rewards, image_irl, start_states_irl)):
        if i < 10:
            plot_reward(label, reward, wall, 'reward_pics/reward_{}'.format(i))
        percent = evaluate_proxy(wall, start_state, reward, label, episode_length=horizon)
        print("Reward had: {}".format(percent))
        reward_percents.append(percent)
    average_percent_reward = float(sum(reward_percents)) / len(reward_percents)
    print(reward_percents[:10])
    print('On average planning with the inferred rewards is '
          + str(100 * average_percent_reward)
          + '% as good as planning with the true rewards')
    return average_percent_reward
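
# Hedged sketch (not from the original codebase): the 1 - %regret value that
# `evaluate_proxy` is assumed to return can be expressed directly from episode
# returns. `optimal_return` is the return of planning with the true reward,
# `proxy_return` is the return (measured under the true reward) of planning with
# the inferred reward, and `random_return` is an optional baseline; the function
# name and arguments below are illustrative assumptions only.
def percent_return_sketch(optimal_return, proxy_return, random_return=0.0):
    """Fraction of achievable return retained when planning with the proxy.

    Equals 1.0 when the proxy plan matches the optimal plan and 0.0 when it is
    no better than the baseline, i.e. 1 - %regret.
    """
    achievable = optimal_return - random_return
    if achievable == 0:
        return 1.0  # nothing to lose, so no regret
    return (proxy_return - random_return) / achievable

# Example: the optimal plan earns 10, the proxy plan earns 8 -> 0.8 (20% regret).
assert abs(percent_return_sketch(10.0, 8.0) - 0.8) < 1e-9
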
def main(_):
    env_name = XMAGICAL_EMBODIMENT_TO_ENV_NAME[FLAGS.embodiment]
    env = utils.make_env(env_name, seed=0)

    # Reward learning wrapper.
    if FLAGS.config.reward_wrapper.pretrained_path is not None:
        env = utils.wrap_learned_reward(env, FLAGS.config)

    viewer = KeyboardEnvInteractor(action_dim=env.action_space.shape[0])
    env.reset()
    obs = env.render("rgb_array")
    viewer.imshow(obs)

    i = [0]
    rews = []

    def step(action):
        obs, rew, done, info = env.step(action)
        rews.append(rew)
        if obs.ndim != 3:
            obs = env.render("rgb_array")
        if done:
            print(f"Done, score {info['eval_score']:.2f}/1.00")
            print("Episode metrics: ")
            for k, v in info["episode"].items():
                print(f"\t{k}: {v}")
            if FLAGS.exit_on_done:
                return
        i[0] += 1
        return obs

    viewer.run_loop(step)
    utils.plot_reward(rews)
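
# The `i = [0]` pattern above is a small closure trick: the nested `step`
# callback mutates the list element instead of rebinding an outer integer,
# which would otherwise require `nonlocal`. Standalone illustration (names are
# illustrative only):
def make_counter():
    count = [0]

    def bump():
        count[0] += 1
        return count[0]

    return bump

bump = make_counter()
assert (bump(), bump()) == (1, 2)
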
def dqfd_test(exp, agent):
    out_filename_format = '_imagetest/episode_{:0>4d}/{:s}/{:0>6d}'
    overall_reward = []
    for i_episode in range(config.TEST_EPISODE):
        print("Environment reset...")
        exp.reset()
        state = None
        meas = None
        next_state = None
        next_meas = None
        offroad_list = []
        otherlane_list = []
        episode_reward = 0
        # transition_queue = collections.deque(maxlen=config.TRAJECTORY_NUM)
        for steps in range(config.REPLAY_FRAME):
            print("Test episode: %d, frame: %d, length of replay memory %d"
                  % (i_episode, steps, len(agent.replay_memory)))
            action_no = agent.e_greedy_select_action(state)
            action = exp.reverse_action(action_no)
            next_meas, next_state, reward, done, _ = exp.step(action)
            next_state = utils.rgb_image_to_tensor(next_state['CameraRGB'])
            offroad_list.append(next_meas['offroad'])
            otherlane_list.append(next_meas['other_lane'])
            episode_reward += reward

            # End the episode if the car stays offroad for 5 consecutive steps
            # or in the other lane for 10 consecutive steps.
            if len(offroad_list) > 10:
                ar = np.array(offroad_list[-5:]).astype('int64')
                tag1 = np.bitwise_and.reduce(ar)
                br = np.array(otherlane_list[-10:]).astype('int64')
                tag2 = np.bitwise_and.reduce(br)
                tag = tag1 | tag2
                if tag:
                    print("Reset because of off road or intersection!")
                    break

            for name, images in exp.cur_image.items():
                filename = out_filename_format.format(i_episode, name, steps)
                images.save_to_disk(filename)
            state = next_state
            meas = next_meas

        overall_reward.append(episode_reward / steps)
        utils.plot_reward(overall_reward)
        print("Episode finished!")

    print("Test finished!")
    return overall_reward
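
# Illustration of the early-termination check above: `np.bitwise_and.reduce`
# over the last k flags is 1 only if every one of them is truthy, i.e. the car
# was offroad for the last 5 steps (or in the other lane for the last 10).
import numpy as np

offroad_flags = [0, 0, 1, 1, 1, 1, 1]            # 5 consecutive offroad steps at the end
ar = np.array(offroad_flags[-5:]).astype('int64')
assert np.bitwise_and.reduce(ar) == 1            # triggers the early break

offroad_flags = [1, 1, 0, 1, 1, 1, 1]            # streak interrupted within the last 5 steps
ar = np.array(offroad_flags[-5:]).astype('int64')
assert np.bitwise_and.reduce(ar) == 0            # no break
# Equivalent, arguably clearer spelling: np.all(np.array(offroad_flags[-5:], dtype=bool))
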
def show_agents(grids, agent_list, agent_names, grid_names, filename='AgentComparison', figtitle=''):
    """Shows how agents perform on a set of gridworlds.

    grids - list of gridworlds (see examples in earlier part of the file)
    agent_list - list of agent objects
    agent_names - names of agents (strings)
    grid_names - names of gridworlds (strings)
    """
    num_ex = len(agent_list)
    num_grids = len(grids)
    fig, axes_grid = plt.subplots(num_grids, num_ex, figsize=(14.0, 4.5))
    if num_grids == 1:
        axes_grid = [axes_grid]

    for i, axes in enumerate(axes_grid):
        # Give each gridworld a name (uncomment to do so)
        # ax.set_ylabel(grid_names[i])

        # Generate MDP
        grid = grids[i]
        mdp = GridworldMdp(grid, noise=0.2)
        walls, reward, start = mdp.convert_to_numpy_input()

        for idx, agent in enumerate(agent_list):
            ax = axes[idx]
            ax.set_aspect('equal')
            plot_reward(reward, walls, '', fig=fig, ax=ax)
            plot_trajectory(walls, reward, start, agent, arrow_width=0.35, fig=fig, ax=ax)
            # Only write agent names on the first row
            if i == 0:
                ax.set_title(agent_names[idx], fontname='Times New Roman', fontsize=16)
            print('Agent {} is {}'.format(agent_names[idx], agent))

    # Increase vertical space between subplots
    # fig.subplots_adjust(hspace=0.2)
    # fig.suptitle(figtitle)
    fig.savefig(filename, bbox_inches='tight', dpi=500)
    print("Saved figure to {}.png".format(filename))
def random_gridworld_plot(agent, other_agent, size, filename='RandomGrid'):
    """Plots a random gridworld."""
    from gridworld.gridworld import Direction
    from utils import Distribution

    if agent is None:
        raise ValueError("agent cannot be None")

    num_R = 5
    mdp = GridworldMdp.generate_random_connected(size, size, num_R, noise=0)
    walls, reward, start = mdp.convert_to_numpy_input()

    def get_policy(agent):
        num_actions = 5
        imsize = len(walls)

        def dist_to_numpy(dist):
            return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

        def action(state):
            # Walls are invalid states and the MDP will refuse to give an action for
            # them. However, the VIN's architecture requires it to provide an action
            # distribution for walls too, so hardcode it to always be STAY.
            x, y = state
            if mdp.walls[y][x]:
                return dist_to_numpy(Distribution({Direction.STAY: 1}))
            return dist_to_numpy(agent.get_action_distribution(state))

        agent.set_mdp(mdp)
        action_dists = [[action((x, y)) for x in range(imsize)] for y in range(imsize)]
        action_dists = np.array(action_dists)
        return action_dists

    fig, axes = plt.subplots(1, 1)
    fig.set_size_inches(5, 5)

    # Reward only
    plot_reward(reward, np.zeros_like(walls), fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'R', bbox_inches='tight', dpi=100)

    # Walls only
    plot_reward(np.zeros_like(reward), walls, fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'W', bbox_inches='tight', dpi=100)

    # Trajectory + Walls + Rewards
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    # plot_trajectory(walls, reward, start, agent, fig=fig, ax=axes)
    policy = get_policy(agent)
    plot_policy(walls, policy, fig=fig, ax=axes)
    fig.savefig(filename + 'Ptrue', bbox_inches='tight', dpi=100)

    axes.clear()
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    predicted = get_policy(other_agent)
    plot_policy_diff(predicted, policy, walls, fig=fig, ax=axes)
    fig.savefig(filename + 'Ppredicted', bbox_inches='tight', dpi=100)
    accumulated_reward += state.reward
    accumulated_reward_list.append(accumulated_reward)

    if opt.disp_on:
        if win_all is None:
            plt.subplot(121)
            win_all = plt.imshow(state.screen)
            plt.subplot(122)
            win_pob = plt.imshow(state.pob)
        else:
            win_all.set_data(state.screen)
            win_pob.set_data(state.pob)
        plt.pause(opt.disp_interval)
        plt.draw()

    epi_step = epi_step + 1
    # Log progress every 100 steps.
    if step % 100 == 0:
        print("Step {}: Accumulated reward = {}".format(step, accumulated_reward))

elapsed_time = time() - start_time
print("----- Finished testing -----")
print("Completed {} games, reached target {} times".format(opt.steps, n_completed_episodes))
print("{}% of games the target was reached".format(100 * n_completed_episodes / opt.steps))
print("Total time: {:.2f} seconds".format(elapsed_time))
print("Accumulated reward: ", accumulated_reward_list[-1])
plot_reward(accumulated_reward_list, opt.acc_reward_test_figure_path)
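
# Minimal sketch of a plotting helper with the calling convention used above
# (an assumption for illustration, not the project's actual plot_reward):
# draw the running accumulated-reward curve and save it to the given path.
import matplotlib.pyplot as plt

def plot_reward_sketch(accumulated_rewards, figure_path):
    fig, ax = plt.subplots()
    ax.plot(range(len(accumulated_rewards)), accumulated_rewards)
    ax.set_xlabel("step")
    ax.set_ylabel("accumulated reward")
    fig.savefig(figure_path, bbox_inches="tight")
    plt.close(fig)
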
        test(env, likelihood)
        env.adapt_a()
        likelihood.reinitialize_optimizer(lr=1e-2)
        l_a2b, l_b2a = control.train_with_buffer(env, likelihood, hparam['nb_episode_adapt'])
        likelihood_a2b_adapt[run, :] = l_a2b
        likelihood_b2a_adapt[run, :] = l_b2a

    utils.plot_training(likelihood_a2b, likelihood_b2a, hparam['output'], True)
    utils.plot_adaptation(likelihood_a2b_adapt, likelihood_b2a_adapt, hparam['output'], True)

    if rl_mode:
        reward_a2b = np.cumsum(reward_a2b, axis=1)
        reward_b2a = np.cumsum(reward_b2a, axis=1)
        reward_a2b_adapt = np.cumsum(reward_a2b_adapt, axis=1)
        reward_b2a_adapt = np.cumsum(reward_b2a_adapt, axis=1)
        utils.plot_reward(reward_a2b, reward_b2a, hparam['output'], True)
        utils.plot_reward_adapt(reward_a2b_adapt, reward_b2a_adapt, hparam['output'], True)


def test(env, likelihood):
    env.compare_directions(likelihood)
    env.compare_likelihood(likelihood, 0, 0, 0, 0, 0)
    env.compare_likelihood(likelihood, 1, 0, 0, 0, 0)
    env.compare_likelihood(likelihood, 2, 0, 0, 0, 0)
    env.compare_likelihood(likelihood, 3, 0, 0, 0, 0)
    # Drop into the debugger after the comparisons.
    __import__('ipdb').set_trace()
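
# The `np.cumsum(..., axis=1)` calls above turn per-episode rewards into
# cumulative reward curves, one row per run. Tiny illustration:
import numpy as np

per_episode = np.array([[1.0, 0.0, 2.0],
                        [0.5, 0.5, 0.5]])
cumulative = np.cumsum(per_episode, axis=1)
# Rows become running totals: [[1.0, 1.0, 3.0], [0.5, 1.0, 1.5]]
assert cumulative[0, -1] == 3.0 and cumulative[1, -1] == 1.5
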
def run_game(self):
    config = self.config
    n = config.runs_per_agent
    prev_best_reward = -1000
    for run in range(n):
        # Potentially, we can change the goals as the agent picks up more skills.
        env = eval(config.environment)
        test_env = eval(config.environment)
        cLoss, aLoss = [], []

        # 0. Instantiate an agent instance of this class.
        agent = self.agentCls(**self.agentArgs)
        obs_dim, act_dim = agent.obs_dim, agent.act_dim

        # 1. Instantiate a memory pool and warm up.
        rpm = ReplayMemory(config.memory_size, obs_dim, act_dim)

        # 2. Set up the logging directory.
        save_dir = config.log_path + "{}_{}".format(self.name, run + 1)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # 3. Start training.
        test_flag, total_steps = 0, 0
        train_rewards, test_means, test_stds = [], [], []
        pbar = tqdm(total=config.train_total_steps)
        while total_steps < config.train_total_steps:
            para = [
                config.reward_scale, config.warmup_size, config.batch_size,
                config.expl_noise
            ]
            train_reward, steps, costC, costA = run_train_episode(env, agent, rpm, *para)
            total_steps += steps
            train_rewards.append(train_reward)
            cLoss.append(costC)
            aLoss.append(costA)
            pbar.set_description('Steps: {} Reward: {}'.format(total_steps, train_reward))
            pbar.update(steps)

            # 4. Start testing.
            if total_steps // config.test_every_steps >= test_flag:
                while total_steps // config.test_every_steps >= test_flag:
                    test_flag += 1
                r_mean, r_std = run_evaluate_episode(test_env, agent)
                logger.info('Steps {}, Evaluate reward: {}'.format(total_steps, r_mean))
                test_means.append(r_mean)
                test_stds.append(r_std)
                if config.save_model and r_mean > prev_best_reward:
                    prev_best_reward = r_mean
                    ckpt = save_dir + '/Steps_{}_reward_{}.ckpt'.format(total_steps, int(r_mean))
                    agent.save(ckpt, program=agent.pred_program)
                np.savez(save_dir + '/record.npz',
                         train=train_rewards, mean=test_means, std=test_stds,
                         closs=cLoss, aloss=aLoss)

        if config.visual_result:
            plot_reward(train_rewards)
            plot_reward(test_means, test_stds)
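
# Hedged sketch of the evaluation helper assumed above: `run_evaluate_episode`
# is taken to roll out a few episodes and report the mean and standard deviation
# of the episode returns. The function name, episode count, `agent.predict`
# call, and the old-style 4-tuple `env.step` return are all assumptions for
# illustration, not the original implementation.
import numpy as np

def run_evaluate_episode_sketch(env, agent, num_episodes=5):
    returns = []
    for _ in range(num_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action = agent.predict(obs)            # assumed deterministic action
            obs, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return float(np.mean(returns)), float(np.std(returns))
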
              '-------------------------------------------------')
        subtask_list.append(subtask_index)
        subtask_reward.append(acc_reward)
        task_r += acc_reward
        if render:
            task[subtask_index].close()

    task_reward.append(task_r)
    time_task = time.time() - time_task_0

    # Record the reward (modulus 1 records every task episode; increase it to record less often).
    if task_epi % 1 == 0:
        plot_reward(subtask_list, subtask_reward,
                    name=model.name + '_' + 'subtask_reward_CartPole_SwingUp_' + str(task_epi),
                    xlabel='subtask', y=195)
        plot_reward(range(len(task_reward)), task_reward,
                    name=model.name + '_' + 'task_reward_CartPole_SwingUp_' + str(task_epi),
                    xlabel='episode', y=1560, scatter=False)
        print('***************************')
        print('task_episode: ', task_epi, ' time: ', time_task)

        if model.name == 'DPGPMM':
            numbers = []
            for comp in model.DP_mix.comps:
        'optimizer_actor_state_dict': agent.optimizer_actor.state_dict(),
        'optimizer_critic_state_dict': agent.optimizer_critic.state_dict()
    }, 'agent_state_dict.pt')

    if np.mean(scores_window) >= 32:
        print("\n Problem Solved!")
        break
    print("Score: {}".format(score))

# Plotting rewards
plot_reward(scores)

############################## TESTING AGENT ########################################################
#####################################################################################################

# Loading saved parameters to test the agent over 100 trials
checkpoint = torch.load('agent_state_dict.pt', map_location="cpu")
agent.actor_main.load_state_dict(checkpoint['actor_main_network_state_dict'])
agent.critic_main.load_state_dict(checkpoint['critic_main_network_state_dict'])


def test(num_episodes=100):
    all_scores = []
    from tqdm import tqdm
    config = init_birl_flags()
    if config.datafile is None:
        print('--datafile option is required')
        exit()

    # Seed the random generators.
    set_seeds(config.seed)

    imagetest, rewardtest, ytest = load_dataset(config.datafile)[-3:]
    for image, reward, policy in zip(imagetest, rewardtest, ytest):
        mdp = GridworldMdp.from_numpy_input(image, reward)
        mdp = GridworldMdpLearnableR.from_full_mdp(mdp)
        inferred_reward = birl(mdp, policy, config.beta,
                               num_burn_in=config.num_burn_in,
                               num_samples=config.num_samples,
                               display_step=config.display_step)
        print('The first set of walls is:')
        print(image)
        print('The first reward should be:')
        print(reward)
        inferred_reward = inferred_reward / inferred_reward.max()
        inferred_reward = np.reshape(inferred_reward, image.shape)
        print('The inferred reward is:')
        print(inferred_reward)
        plot_reward(reward, inferred_reward)
        break
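
# Note on the normalization above: a reward inferred by BIRL is only identified
# up to positive scaling (scaling all rewards by a positive constant leaves the
# optimal policy unchanged), so dividing by the maximum puts it on a comparable
# scale before plotting it next to the true reward. Tiny illustration with
# made-up numbers:
import numpy as np

raw = np.array([0.2, 0.8, 2.0])
normalized = raw / raw.max()                     # -> [0.1, 0.4, 1.0]
assert np.argmax(raw) == np.argmax(normalized)   # state ranking is preserved
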
def dqfd_replay(exp, agent):
    overall_reward = []
    for i_episode in range(config.REPLAY_EPISODE):
        print("Environment reset...")
        exp.reset()
        state = None
        meas = None
        next_state = None
        next_meas = None
        offroad_list = []
        otherlane_list = []
        episode_reward = 0
        # transition_queue = collections.deque(maxlen=config.TRAJECTORY_NUM)
        for steps in itertools.count(config.DEMO_BUFFER_SIZE):
            frame_no = steps - config.DEMO_BUFFER_SIZE
            print("Replay episode: %d, frame: %d, length of replay memory %d"
                  % (i_episode, frame_no, len(agent.replay_memory)))
            action_no = agent.e_greedy_select_action(state)
            action = exp.reverse_action(action_no)
            next_meas, next_state, reward, done, _ = exp.step(action)
            next_state = utils.rgb_image_to_tensor(next_state['CameraRGB'])
            offroad_list.append(next_meas['offroad'])
            otherlane_list.append(next_meas['other_lane'])
            episode_reward += reward

            # Reset the environment if the car stays offroad for 5 consecutive steps
            # or in the other lane for 10 consecutive steps.
            if len(offroad_list) > 10:
                ar = np.array(offroad_list[-5:]).astype('int64')
                tag1 = np.bitwise_and.reduce(ar)
                br = np.array(otherlane_list[-10:]).astype('int64')
                tag2 = np.bitwise_and.reduce(br)
                tag = tag1 | tag2
                if tag:
                    exp.reset()

            if meas:
                transition = Transition(
                    meas, state, torch.tensor([[action_no]]), torch.tensor([[reward]]),
                    next_state, next_meas, torch.zeros(1)
                )
                # TODO: use both the measurement and the image later
                agent.replay_memory_push([transition])

            state = next_state
            meas = next_meas

            if agent.replay_memory.is_full:
                print("Training!")
                agent.train()

            # if done:
            #     print("episode: %d, memory length: %d epsilon: %f"
            #           % (i_episode, len(agent.replay_memory), agent.epsilon))
            #     break

            if steps % 100 == 0:
                agent.update_target_net()

            if frame_no >= config.REPLAY_FRAME:
                overall_reward.append(episode_reward / config.REPLAY_FRAME)
                utils.plot_reward(overall_reward)
                print("Episode finished!")
                break

        # Save the results every 20 episodes.
        if i_episode % 20 == 0:
            print("Saving parameters for the last 20 episodes")
            reward_df = pd.DataFrame(overall_reward)
            reward_df.to_csv('_episode_reward%d.csv' % i_episode)
            with open(config.CARLA_TRAIN_FILE, 'wb') as f:
                pickle.dump(agent, f)
            print("Trained parameters achieved!")

    print("Replay finished!")
    return overall_reward
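
# The inner loop above counts absolute replay-memory indices: it starts at
# DEMO_BUFFER_SIZE so that `frame_no = steps - DEMO_BUFFER_SIZE` numbers only the
# frames collected online, after the demonstration buffer. Small check with an
# assumed buffer size:
import itertools

DEMO_BUFFER_SIZE = 3  # illustrative value only
for steps in itertools.count(DEMO_BUFFER_SIZE):
    frame_no = steps - DEMO_BUFFER_SIZE
    if frame_no == 2:
        break
# steps is 5, frame_no is 2: online frames are numbered from 0.
assert (steps, frame_no) == (5, 2)
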