def create_agent(agent, gamma, beta, num_iters, max_delay,
                 hyperbolic_constant, calibration):
    """Creates the agent specified in config."""
    if agent == 'optimal':
        return fast_agents.FastOptimalAgent(
            gamma=gamma, beta=beta, num_iters=num_iters)
    elif agent == 'naive':
        return fast_agents.FastNaiveTimeDiscountingAgent(
            max_delay, hyperbolic_constant,
            gamma=gamma, beta=beta, num_iters=num_iters)
    elif agent == 'sophisticated':
        return fast_agents.FastSophisticatedTimeDiscountingAgent(
            max_delay, hyperbolic_constant,
            gamma=gamma, beta=beta, num_iters=num_iters)
    elif agent == 'myopic':
        return fast_agents.FastMyopicAgent(
            max_delay, gamma=gamma, beta=beta, num_iters=num_iters)
    elif agent in ['overconfident', 'underconfident']:
        # Overconfident agents must have calibration > 1, underconfident
        # agents calibration < 1.
        if agent == 'overconfident':
            assert calibration > 1
        else:
            assert calibration < 1
        return fast_agents.FastUncalibratedAgent(
            gamma=gamma, beta=beta, num_iters=num_iters,
            calibration_factor=calibration)
    raise ValueError('Invalid agent: ' + agent)
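# A minimal usage sketch (illustrative, not part of the original module).
# The gamma/num_iters values mirror those in main() below; beta=None is
# assumed to select the agents' default (deterministic) policy, and the
# calibration argument is only inspected for the two uncalibrated variants,
# so a placeholder works for the other agent types.
#
# naive_agent = create_agent('naive', gamma=0.95, beta=None, num_iters=20,
#                            max_delay=10, hyperbolic_constant=1,
#                            calibration=None)
# over_agent = create_agent('overconfident', gamma=0.95, beta=None,
#                           num_iters=20, max_delay=10, hyperbolic_constant=1,
#                           calibration=5)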
def infer_with_rational_planner(config, beta=None):
    """Trains the planner on data from a rational (optimal) agent with the
    given beta, mimicking standard IRL."""
    if config.verbosity >= 2:
        print('Using a rational planner with beta {} to mimic normal IRL'.format(beta))
    architecture = PlannerArchitecture(config)
    agent, other_agents = create_agents_from_config(config)
    num_without_reward = make_evenly_batched(config.num_human_trajectories, config)
    num_simulated, num_validation = config.num_simulated, config.num_validation
    optimal_agent = fast_agents.FastOptimalAgent(
        gamma=config.gamma, beta=beta, num_iters=config.num_iters)
    train_data, validation_data = generate_data_for_planner(
        num_simulated, num_validation, optimal_agent, config, other_agents)
    reward_data = generate_data_for_reward(
        num_without_reward, agent, config, other_agents)
    return run_inference(train_data, validation_data, reward_data,
                         two_phase_algorithm, architecture, config)
def infer_with_no_rewards(config, train_jointly, initialize):
    """Trains planner and reward with no reward labels, either jointly or
    iteratively, optionally initializing the planner on optimal-agent data."""
    if config.verbosity >= 2:
        s1 = 'jointly' if train_jointly else 'iteratively'
        s2 = 'with' if initialize else 'without'
        print('No rewards given, training planner and reward {} {} initialization'.format(s1, s2))
    architecture = PlannerArchitecture(config)
    agent, other_agents = create_agents_from_config(config)
    num_simulated, num_validation = config.num_simulated, config.num_validation
    num_without_reward = make_evenly_batched(config.num_human_trajectories, config)
    reward_data = generate_data_for_reward(
        num_without_reward, agent, config, other_agents)
    alg = joint_algorithm if train_jointly else iterative_algorithm
    train_data, validation_data = None, None
    if initialize:
        optimal_agent = fast_agents.FastOptimalAgent(
            gamma=config.gamma, beta=config.beta, num_iters=config.num_iters)
        train_data, validation_data = generate_data_for_planner(
            num_simulated, num_validation, optimal_agent, config, other_agents)
    return run_inference(train_data, validation_data, reward_data,
                         alg, architecture, config)
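def run_experiment(config):
    """Hypothetical driver (illustrative only): dispatches to one of the
    inference entry points above based on a config flag. `config.algorithm`
    and `config.initialize` are assumed attribute names for this sketch,
    not necessarily the repo's real flags."""
    if config.algorithm == 'rational_planner':
        return infer_with_rational_planner(config, beta=config.beta)
    train_jointly = (config.algorithm == 'joint')
    return infer_with_no_rewards(config, train_jointly=train_jointly,
                                 initialize=config.initialize)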
def main():
    # 'X' = wall, 'A' = agent start position; numeric entries are the
    # rewards for reaching those cells.
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', ' ', -90, -90, -90, -90, '8', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', -99, '2', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', '1', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    mdp = GridworldMdp(grid, living_reward=-0.01, noise=0.2)
    env = Mdp(mdp)

    opt = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    naive = NaiveTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    soph = fast_agents.FastSophisticatedTimeDiscountingAgent(
        10, 1, gamma=0.95, num_iters=20)
    myopic = fast_agents.FastMyopicAgent(6, gamma=0.95, num_iters=20)
    over = fast_agents.FastUncalibratedAgent(
        gamma=0.95, num_iters=20, calibration_factor=5)
    under = fast_agents.FastUncalibratedAgent(
        gamma=0.95, num_iters=20, calibration_factor=0.5)
    agents = [opt, naive, soph, myopic, over, under]
    names = ['Optimal', 'Naive', 'Sophisticated', 'Myopic',
             'Overconfident', 'Underconfident']
    for name, agent in zip(names, agents):
        print('{} agent'.format(name))
        agent.set_mdp(mdp)
        trajectory = run_agent(agent, env, episode_length=50, determinism=True)
        if agent == naive:
            print([a for _, a, _, _ in trajectory])
        print_training_example(mdp, trajectory)
    print(opt.values.T)
def test_gridworld_optimal_agent(self):
    agent = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    self.optimal_agent_test(agent)
def test_compare_optimal_agents(self):
    agent1 = agents.OptimalAgent(gamma=0.95, num_iters=20)
    agent2 = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    self.compare_agents('optimal', agent1, agent2, print_mdp=True)