def worker(runs, env_cfg, task, param):
    environment = Maze(**env_cfg)
    agent_program = AGENT_PROGRAM[task[0]](environment.act_spec,
                                           environment.obs_spec, **task[1])
    agent = core.Agent(agent_program,
                       lambda env: (env.STATE_IDX[env.obs], env.reward),
                       lambda action, env: env.step(action))
    # Record the steps-to-goal of each episode for this run.
    steps = []
    for _ in range(task[2]['episodes']):
        core.Run(agent, environment).start()
        steps.append(environment.nstep)
        agent_program.reset()
        environment.reset()
    key = {**env_cfg, 'runs': runs, 'alg': task[0], **task[1], **task[2]}
    SharedMem.dump(Testbed.key_for('episodes', **key), param['run'], steps)
def worker(runs, env_cfg, task, param):
    environment = Maze(**env_cfg)
    agent_program = AGENT_PROGRAM[task[0]](environment.act_spec,
                                           environment.obs_spec, **task[1])
    agent = core.Agent(agent_program,
                       lambda env: (env.STATE_IDX[env.obs], env.reward),
                       lambda action, env: env.step(action))
    # Keep starting fresh episodes until the step budget is spent.
    while environment.steps_cnt < task[2]['steps']:
        core.Run(agent, environment).start()
        agent_program.reset()
        environment.reset()
    key = {**env_cfg, 'runs': runs, 'alg': task[0], **task[1], **task[2]}
    SharedMem.dump(Testbed.key_for('rewards', **key), param['run'],
                   environment.rewards[:task[2]['steps']])
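Both Maze workers share the same (runs, env_cfg, task, param) signature, so independent runs can be fanned out over a process pool. Below is a minimal driver sketch, assuming multiprocessing; the Maze kwargs and the task tuple are illustrative placeholders, not the testbed's actual configuration.

from multiprocessing import Pool

RUNS = 50                                    # hypothetical run count
ENV_CFG = {'width': 9, 'height': 6}          # placeholder Maze kwargs
TASK = ('dyna-q',                            # placeholder algorithm key
        {'alpha': 0.1, 'epsilon': 0.1, 'gamma': 0.95},
        {'episodes': 50})

if __name__ == '__main__':
    with Pool() as pool:
        # param['run'] tells each worker which SharedMem slot to fill.
        pool.starmap(worker,
                     [(RUNS, ENV_CFG, TASK, {'run': i}) for i in range(RUNS)])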
def test_runs_in_mountain_car_environment():
    environment = MountainCar()
    agent_program = SemiGradientSarsa(environment.act_spec,
                                      environment.obs_spec,
                                      alpha=0.5, epsilon=0.0, gamma=1.0)
    agent = core.Agent(agent_program,
                       lambda env: (env.obs, env.reward, env.done()),
                       lambda action, env: env.step(action))
    core.Run(agent, environment).start()
def test_true_sarsa_lambda_runs_in_mountain_car_environment():
    environment = MountainCar()
    agent_program = TrueOnlineSarsaLambda(environment.act_spec,
                                          environment.obs_spec,
                                          alpha=0.2, epsilon=0.0,
                                          gamma=1.0, lmbda=0.9)
    agent = core.Agent(agent_program,
                       lambda env: (env.obs, env.reward, env.done()),
                       lambda action, env: env.step(action))
    core.Run(agent, environment).start()
def test_collects_reward_from_each_step(self):
    arms = 10
    eps = 0.1
    environment = NArmedBanditEnv(10, arms)
    agent = core.Agent(
        epsilongreedy.SampleAverage(
            act_spec=core.Spec([core.Space(shape=(arms,))]),
            obs_spec=core.Spec([core.Space(shape=(arms,))]),
            epsilon=eps),
        lambda env: (env.last_action, env.reward),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
    assert len(environment.all_rewards) == 10
    assert len(environment.optimal_actions) == 10
def test_nstep_sarsa():
    environment = WindyGridWorld()
    agent_program = OnPolicyNStepSarsa(environment.act_spec,
                                       environment.obs_spec,
                                       n=2, alpha=0.5, epsilon=0.1, gamma=1.0)
    agent = core.Agent(
        agent_program,
        lambda env: (env.STATE_IDX[env.obs], env.reward, env.done()),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
def test_actor_critic_runs_in_mountain_car_environment():
    environment = MountainCar()
    agent_program = ActorCriticLambda(environment.act_spec,
                                      environment.obs_spec,
                                      alpha_w=0.1, alpha_theta=0.01,
                                      gamma=1.0, lambda_w=0.9,
                                      lambda_theta=0.9)
    agent = core.Agent(agent_program,
                       lambda env: (env.obs, env.reward, env.done()),
                       lambda action, env: env.step(action))
    core.Run(agent, environment).start()
def worker(runs, env_cfg, task, param):
    environment = NArmedBanditEnv(**env_cfg)
    agent = core.Agent(
        AGENT_PROGRAM[task[0]](environment.act_spec, environment.obs_spec,
                               **task[1]),
        lambda env: (env.last_action, env.reward),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
    key = {**env_cfg, 'runs': runs, 'alg': task[0], **task[1]}
    # Publish this run's reward series and optimal-action flags.
    SharedMem.dump(Testbed.key_for('rewards', **key), param['run'],
                   environment.all_rewards)
    SharedMem.dump(Testbed.key_for('actions', **key), param['run'],
                   environment.optimal_actions)
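On the read side (not shown in these snippets), the per-run dumps are typically stacked and averaged to produce the classic n-armed-testbed curves. A minimal numpy sketch under that assumption, with zero-filled placeholders standing in for the collected arrays:

import numpy as np

# Placeholder for a (runs, steps) matrix gathered from the 'rewards' dumps.
rewards = np.zeros((2000, 1000))
# Placeholder for the matching 0/1 'actions' (optimal-action) flags.
optimal = np.zeros((2000, 1000))

avg_reward = rewards.mean(axis=0)            # average reward at each step
pct_optimal = 100.0 * optimal.mean(axis=0)   # % optimal action at each step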
def worker(runs, env_cfg, task, param):
    environment = RaceTrack(**env_cfg)
    agent = core.Agent(
        AGENT_PROGRAM[task[0]](environment.act_spec, environment.obs_spec,
                               **task[1]),
        lambda env: (env.obs, -1),  # constant -1 reward per step
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
    key = {**env_cfg, 'runs': runs, 'alg': task[0], **task[1]}
    states, actions, rewards = environment.episode()
    run = param['run']
    SharedMem.dump(Testbed.key_for('states', **key), run, states)
    SharedMem.dump(Testbed.key_for('rewards', **key), run, rewards)
    SharedMem.dump(Testbed.key_for('actions', **key), run, actions)
def run(self):
    environment = WindyGridWorld(stochastic=self.stochastic)
    agent_program = OnPolicyNStepSarsa(environment.act_spec,
                                       environment.obs_spec,
                                       n=self.n, alpha=self.alpha,
                                       epsilon=self.epsilon, gamma=self.gamma)
    agent = core.Agent(
        agent_program,
        lambda env: (env.STATE_IDX[env.obs], env.reward, env.done()),
        lambda action, env: env.step(action))
    for _ in tqdm.tqdm(range(self.runs)):
        core.Run(agent, environment).start()
        self.steps.append(environment.nstep)
        agent_program.reset()
        environment.reset()
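self.steps ends up holding one episode length per run; the usual windy-gridworld summary plots cumulative time steps against completed episodes. A hedged matplotlib sketch of that figure, with placeholder lengths in place of real results:

import numpy as np
import matplotlib.pyplot as plt

steps = [120, 90, 60, 45, 30, 22, 18, 17, 16, 16]   # placeholder lengths
plt.plot(np.cumsum(steps), np.arange(1, len(steps) + 1))
plt.xlabel('Time steps')
plt.ylabel('Episodes')
plt.show()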