def test_run(self):
    # Set up environment and policy
    env = gym.make('MountainCar-v0')
    pol = vcf.DiscreteRandomControl(env.action_space.n)

    # Define an agent
    phi_1 = vcf.BiasUnit()
    td_1 = vcf.TD(len(phi_1))
    params_1 = {
        'alpha': 0.01,
        'gm': 0.999,
        'gm_p': 0.999,
        'lm': 0.1,
    }
    agent_1 = vcf.Agent(td_1, phi_1, params_1)
    agents = [agent_1]

    # Set up the experiment
    experiment = vcf.LiveExperiment(env, pol, agents)

    # Try running the experiment
    num_eps = 10
    max_steps = 10
    experiment.run(num_eps, max_steps, callbacks=[])
def test_run_with_callbacks(self):
    # Set up environment and policy
    env = gym.make('MountainCar-v0')
    pol = vcf.DiscreteRandomControl(env.action_space.n)

    # Define an agent
    phi_1 = vcf.BiasUnit()
    td_1 = vcf.TD(len(phi_1))
    params_1 = {
        'alpha': 0.01,
        'gm': 0.999,
        'gm_p': 0.999,
        'lm': 0.1,
    }
    agent_1 = vcf.Agent(td_1, phi_1, params_1)
    agents = [agent_1]

    # Set up the experiment
    experiment = vcf.LiveExperiment(env, pol, agents)

    # Set up testing callbacks
    cbk = _CheckCallback()

    # Try running the experiment
    num_eps = 10
    max_steps = 10
    experiment.run(num_eps, max_steps, callbacks=[cbk])

    # Check that the callbacks ran properly
    assert (cbk.experiment_begin > 0)
    assert (cbk.experiment_end > 0)
    assert (cbk.episode_begin > 0)
    assert (cbk.episode_end > 0)
    assert (cbk.step_begin > 0)
    assert (cbk.step_end > 0)
def test_terminal_context(self):
    # Set up the agent
    param_funcs = {
        'alpha': 0.05,
        'gm': vcf.Constant(0.9999, 0),
        'gm_p': vcf.Constant(0.9999, 0),
        'lm': 0.1,
    }
    phi = vcf.BinaryVector(10)
    algo = vcf.TD(len(phi))
    agent = vcf.Agent(algo, phi, param_funcs)

    # No base context
    base_ctx = {}
    term_ctx = agent.terminal_context(base_ctx)
    assert (isinstance(term_ctx, dict))
    assert (term_ctx['done'] == True)
    assert (term_ctx['r'] == 0)
    assert (all(term_ctx['xp'] == 0))

    # Nonsense base context (its entries should still be present)
    base_ctx = {'__' + str(i): i**2 for i in range(10)}
    term_ctx = agent.terminal_context(base_ctx)
    assert (isinstance(term_ctx, dict))
    assert (term_ctx['done'] == True)
    assert (term_ctx['r'] == 0)
    assert (all(term_ctx['xp'] == 0))
    assert (all(key in term_ctx for key in base_ctx.keys()))
    assert (all(term_ctx[key] == val for key, val in base_ctx.items()))
def test_setup(self):
    # Set up the agent
    param_funcs = {
        'alpha': 0.05,
        'gm': vcf.Constant(0.9999, 0),
        'gm_p': vcf.Constant(0.9999, 0),
        'lm': 0.1,
    }
    phi = vcf.BinaryVector(10)
    algo = vcf.TD(len(phi))
    agent = vcf.Agent(algo, phi, param_funcs)
# Tile coding for discretization to binary vectors
tiling_1 = vcf.features.BinaryTiling(env.observation_space, 11)
tiling_2 = vcf.features.BinaryTiling(env.observation_space, 19)
tiling_3 = vcf.features.BinaryTiling(env.observation_space, 31)

# Concatenate binary vectors
phi = vcf.Union(tiling_1, tiling_2, tiling_3)

# Define the control (discrete-action Q-learning)
dq = vcf.DiscreteQ(len(phi), na, epsilon=0.002)
dq_params = {
    'alpha': vcf.parameters.EpisodicPowerLaw(0.2, 0.25),
    'gm': 0.9999,
    'gm_p': vcf.Constant(0.9999, 0),
    'lm': vcf.Constant(0.5, 0),
}
control = vcf.Agent(dq, phi, dq_params)

# List of agents to update
learners = [control]

# Set up the experiment
experiment = vcf.LiveExperiment(env, control, learners=learners)

# Set up callbacks
hist_cbk = vcf.callbacks.History()
cbk_lst = [
    vcf.callbacks.Progress(),
    hist_cbk,
]

# Run the experiment
experiment.run(150, 2000, callbacks=cbk_lst)
    v_lm = kappa(ctx)
    v_lm_p = kappa_p(ctx)
    v_nxt = value_agent.get_value(ctx['obs_p'])
    # Compute the "reward" used for learning the second moment of the return
    g_bar = ctx['r'] + v_gm_p * (1 - v_lm_p) * v_nxt
    r_bar = g_bar**2 + 2 * v_gm_p * v_lm_p * g_bar * v_nxt
    return r_bar


# Set up storage for run data
frames = []

# Perform multiple runs of the given number of episodes
for run in range(num_runs):
    # Set up (or reset) the agents
    value_agent = vcf.Agent(vcf.algos.TD(num_states), phi, value_params)
    direct_agent = vcf.Agent(vcf.algos.TD(num_states), phi, direct_params,
                             reward_func=direct_reward)
    second_agent = vcf.Agent(vcf.algos.TD(num_states), phi, second_params,
                             reward_func=second_moment_reward)

    # Set up the experiment
    learners = [direct_agent, second_agent, value_agent]
    experiment = vcf.LiveExperiment(env, control, learners=learners)

    # Set up callbacks to record runs
    exclusions = ['x', 'xp']
tiling_2 = vcf.features.BinaryTiling(env.observation_space, 19)
bias = vcf.features.BiasUnit()

# Concatenate binary vectors
phi = vcf.Union(bias, tiling_1, tiling_2)

# Parameters for the agent
td_params = {
    'alpha': vcf.parameters.EpisodicPowerLaw(0.15, 0.5),
    'gm': vcf.Constant(0.999, 0),
    'gm_p': vcf.Constant(0.999, 0),
    'lm': vcf.Constant(0.1, 0),
}

# Specify the algorithm
algo = vcf.algos.TD(len(phi))

# Combine into an agent
agent = vcf.Agent(algo, phi, td_params)

# List of agents to update
learners = [agent]

# Set up the experiment
experiment = vcf.LiveExperiment(env, control, learners=learners)

# Set up callbacks
hist_cbk = vcf.callbacks.History()
cbk_lst = [
    vcf.callbacks.Progress(),
    hist_cbk,
]

# Initialize via grid-search
if __name__ == "__main__":
    import gym
    env = gym.make('SimpleMDP-v0')
    ns = env.observation_space.n
    na = env.action_space.n

    q_params = {
        'alpha': vcf.Constant(0.01),
        'gm': vcf.Constant(0.999, 0),
        'gm_p': vcf.Constant(0.999, 0),
        'lm': vcf.Constant(0.01, 0),
    }
    q_phi = vcf.BinaryVector(ns)
    q_algo = vcf.DiscreteQ(len(q_phi), na, epsilon=0.05)
    control = vcf.Agent(q_algo, q_phi, q_params)

    # Define some other agents that simply learn the value function
    phi1 = vcf.BinaryVector(ns)
    td_params = {
        'alpha': vcf.Constant(0.01),
        'gm': vcf.Constant(0.999, 0),
        'gm_p': vcf.Constant(0.999, 0),
        'lm': vcf.Constant(0.01, 0),
    }
    td_agent1 = vcf.Agent(vcf.TD(len(phi1)), phi1, td_params)

    phi2 = vcf.BiasUnit()
    td_params2 = {
        'alpha': 0.01,
        'gm': 0.9,