def testAgentInputPostprocessingEnabled(self):
    self.writeOutputs(self.test_dir)
    # Rewrite the files to drop advantages and value_targets for testing
    for path in glob.glob(self.test_dir + "/*.json"):
        out = []
        for line in open(path).readlines():
            data = json.loads(line)
            del data["advantages"]
            del data["value_targets"]
            out.append(data)
        with open(path, "w") as f:
            for data in out:
                f.write(json.dumps(data))
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "input": self.test_dir,
            "input_evaluation": [],
            "postprocess_inputs": True,  # adds back 'advantages'
        })
    result = agent.train()
    self.assertEqual(result["timesteps_total"], 250)  # read from input
    self.assertTrue(np.isnan(result["episode_reward_mean"]))
def testAgentInputPostprocessingEnabled(self):
    self.writeOutputs(self.test_dir)
    # Rewrite the files to drop advantages and value_targets for testing
    for path in glob.glob(self.test_dir + "/*.json"):
        out = []
        for line in open(path).readlines():
            data = json.loads(line)
            del data["advantages"]
            del data["value_targets"]
            out.append(data)
        with open(path, "w") as f:
            for data in out:
                f.write(json.dumps(data))
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "input": self.test_dir,
            "input_evaluation": None,
            "postprocess_inputs": True,  # adds back 'advantages'
        })
    result = agent.train()
    self.assertEqual(result["timesteps_total"], 250)  # read from input
    self.assertTrue(np.isnan(result["episode_reward_mean"]))
def doTestNestedTuple(self, make_env):
    ModelCatalog.register_custom_model("composite2", TupleSpyModel)
    register_env("nested2", make_env)
    pg = PGAgent(env="nested2", config={
        "num_workers": 0,
        "sample_batch_size": 5,
        "train_batch_size": 5,
        "model": {
            "custom_model": "composite2",
        },
    })
    pg.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedTuple(self, make_env):
    ModelCatalog.register_custom_model("composite2", TupleSpyModel)
    register_env("nested2", make_env)
    pg = PGAgent(
        env="nested2",
        config={
            "num_workers": 0,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite2",
            },
        })
    pg.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedDict(self, make_env, test_lstm=False):
    ModelCatalog.register_custom_model("composite", DictSpyModel)
    register_env("nested", make_env)
    pg = PGAgent(
        env="nested",
        config={
            "num_workers": 0,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite",
                "use_lstm": test_lstm,
            },
        })
    pg.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedDict(self, make_env, test_lstm=False):
    ModelCatalog.register_custom_model("composite", DictSpyModel)
    register_env("nested", make_env)
    pg = PGAgent(env="nested", config={
        "num_workers": 0,
        "sample_batch_size": 5,
        "model": {
            "custom_model": "composite",
            "use_lstm": test_lstm,
        },
    })
    pg.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
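# Hypothetical helper matching the one_hot() calls in the nested-space tests
# above (its actual definition is not included in these excerpts): encode an
# integer as a one-hot list of length n, so it compares equal to the
# .tolist() outputs checked by assertEqual.
def one_hot(i, n):
    out = [0.0] * n
    out[i] = 1.0
    return out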
def writeOutputs(self, output):
    agent = PGAgent(env="CartPole-v0", config={
        "output": output,
        "sample_batch_size": 250,
    })
    agent.train()
    return agent
def writeOutputs(self, output):
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "output": output,
            "sample_batch_size": 250,
        })
    agent.train()
    return agent
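# Illustrative sketch (not part of the original tests): with the "output"
# config above, the agent logs each sampled batch as one JSON object per
# line, and the testAgentInput* tests read these files back via the "input"
# config. read_logged_batches is a hypothetical helper, using only the
# standard glob/json modules, for inspecting those files directly.
import glob
import json

def read_logged_batches(output_dir):
    batches = []
    for path in glob.glob(output_dir + "/*.json"):
        for line in open(path).readlines():
            # Each line decodes to a dict of arrays, including keys such as
            # "advantages" and "value_targets" (see the postprocessing test).
            batches.append(json.loads(line))
    return batches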
def testAgentInputDir(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(env="CartPole-v0", config={
        "input": self.test_dir,
        "input_evaluation": None,
    })
    result = agent.train()
    self.assertEqual(result["timesteps_total"], 250)  # read from input
    self.assertTrue(np.isnan(result["episode_reward_mean"]))
def testTrainCartpole(self):
    register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
    pg = PGAgent(env="test", config={"num_workers": 0})
    for i in range(100):
        result = pg.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
        if result["episode_reward_mean"] >= 100:
            return
    raise Exception("failed to improve reward")
def testMultiAgentComplexSpaces(self):
    ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
    ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
    register_env("nested_ma", lambda _: NestedMultiAgentEnv())
    act_space = spaces.Discrete(2)
    pg = PGAgent(env="nested_ma", config={
        "num_workers": 0,
        "sample_batch_size": 5,
        "train_batch_size": 5,
        "multiagent": {
            "policy_graphs": {
                "tuple_policy": (PGPolicyGraph, TUPLE_SPACE, act_space, {
                    "model": {
                        "custom_model": "tuple_spy"
                    }
                }),
                "dict_policy": (PGPolicyGraph, DICT_SPACE, act_space, {
                    "model": {
                        "custom_model": "dict_spy"
                    }
                }),
            },
            "policy_mapping_fn": lambda a: {
                "tuple_agent": "tuple_policy",
                "dict_agent": "dict_policy"
            }[a],
        },
    })
    pg.train()

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def testAgentInputList(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(env="CartPole-v0", config={
        "input": glob.glob(self.test_dir + "/*.json"),
        "input_evaluation": None,
        "sample_batch_size": 99,
    })
    result = agent.train()
    self.assertEqual(result["timesteps_total"], 250)  # read from input
    self.assertTrue(np.isnan(result["episode_reward_mean"]))
def testTrainMultiCartpoleSinglePolicy(self):
    n = 10
    register_env("multi_cartpole", lambda _: MultiCartpole(n))
    pg = PGAgent(env="multi_cartpole", config={"num_workers": 0})
    for i in range(100):
        result = pg.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
        if result["episode_reward_mean"] >= 50 * n:
            return
    raise Exception("failed to improve reward")
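# Context sketch (an assumption, not the actual MultiCartpole source, which
# is not included in these excerpts): the multi-agent tests here rely on
# RLlib's MultiAgentEnv API, where reset() and step() return per-agent dicts
# and the done dict carries a special "__all__" key. A minimal stand-in:
import gym
from ray.rllib.env import MultiAgentEnv

class SimpleMultiCartpole(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [gym.make("CartPole-v0") for _ in range(num)]
        self.dones = set()

    def reset(self):
        self.dones = set()
        return {i: a.reset() for i, a in enumerate(self.agents)}

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, action in action_dict.items():
            obs[i], rew[i], done[i], info[i] = self.agents[i].step(action)
            if done[i]:
                self.dones.add(i)
        # The episode ends for the whole env only once every agent is done.
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info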
def testAgentInputDir(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "input": self.test_dir,
            "input_evaluation": [],
        })
    result = agent.train()
    self.assertEqual(result["timesteps_total"], 250)  # read from input
    self.assertTrue(np.isnan(result["episode_reward_mean"]))
def testAgentInputList(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "input": glob.glob(self.test_dir + "/*.json"),
            "input_evaluation": [],
            "sample_batch_size": 99,
        })
    result = agent.train()
    self.assertEqual(result["timesteps_total"], 250)  # read from input
    self.assertTrue(np.isnan(result["episode_reward_mean"]))
def testAgentInputDict(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(env="CartPole-v0", config={
        "input": {
            self.test_dir: 0.1,
            "sampler": 0.9,
        },
        "train_batch_size": 2000,
        "input_evaluation": None,
    })
    result = agent.train()
    self.assertTrue(not np.isnan(result["episode_reward_mean"]))
def testAgentInputEvalSim(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(env="CartPole-v0", config={
        "input": self.test_dir,
        "input_evaluation": "simulation",
    })
    for _ in range(50):
        result = agent.train()
        if not np.isnan(result["episode_reward_mean"]):
            return  # simulation ok
        time.sleep(0.1)
    assert False, "did not see any simulation results"
def testAgentInputDict(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "input": {
                self.test_dir: 0.1,
                "sampler": 0.9,
            },
            "train_batch_size": 2000,
            "input_evaluation": [],
        })
    result = agent.train()
    self.assertTrue(not np.isnan(result["episode_reward_mean"]))
def testAgentInputEvalSim(self):
    self.writeOutputs(self.test_dir)
    agent = PGAgent(
        env="CartPole-v0",
        config={
            "input": self.test_dir,
            "input_evaluation": ["simulation"],
        })
    for _ in range(50):
        result = agent.train()
        if not np.isnan(result["episode_reward_mean"]):
            return  # simulation ok
        time.sleep(0.1)
    assert False, "did not see any simulation results"
def testMultiAgentComplexSpaces(self):
    ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
    ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
    register_env("nested_ma", lambda _: NestedMultiAgentEnv())
    act_space = spaces.Discrete(2)
    pg = PGAgent(
        env="nested_ma",
        config={
            "num_workers": 0,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "multiagent": {
                "policy_graphs": {
                    "tuple_policy": (
                        PGPolicyGraph, TUPLE_SPACE, act_space,
                        {"model": {"custom_model": "tuple_spy"}}),
                    "dict_policy": (
                        PGPolicyGraph, DICT_SPACE, act_space,
                        {"model": {"custom_model": "dict_spy"}}),
                },
                "policy_mapping_fn": lambda a: {
                    "tuple_agent": "tuple_policy",
                    "dict_agent": "dict_policy"}[a],
            },
        })
    pg.train()

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def testInvalidModel(self):
    ModelCatalog.register_custom_model("invalid", InvalidModel)
    self.assertRaises(ValueError, lambda: PGAgent(
        env="CartPole-v0", config={
            "model": {
                "custom_model": "invalid",
            },
        }))
def testQueryEvaluators(self):
    register_env("test", lambda _: gym.make("CartPole-v0"))
    pg = PGAgent(
        env="test", config={"num_workers": 2, "sample_batch_size": 5})
    results = pg.optimizer.foreach_evaluator(lambda ev: ev.batch_steps)
    results2 = pg.optimizer.foreach_evaluator_with_index(
        lambda ev, i: (i, ev.batch_steps))
    # Three results: the local evaluator plus the 2 remote workers.
    self.assertEqual(results, [5, 5, 5])
    self.assertEqual(results2, [(0, 5), (1, 5), (2, 5)])
def testMultiAgent(self):
    register_env("multi_cartpole", lambda _: MultiCartpole(10))
    single_env = gym.make("CartPole-v0")

    def gen_policy():
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        return (PGPolicyGraph, obs_space, act_space, {})

    pg = PGAgent(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "output": self.test_dir,
            "multiagent": {
                "policy_graphs": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(
                        ["policy_1", "policy_2"])),
            },
        })
    pg.train()
    self.assertEqual(len(os.listdir(self.test_dir)), 1)
    pg.stop()

    pg = PGAgent(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "input": self.test_dir,
            "input_evaluation": "simulation",
            "train_batch_size": 2000,
            "multiagent": {
                "policy_graphs": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(
                        ["policy_1", "policy_2"])),
            },
        })
    for _ in range(50):
        result = pg.train()
        if not np.isnan(result["episode_reward_mean"]):
            return  # simulation ok
        time.sleep(0.1)
    assert False, "did not see any simulation results"
def testInvalidModel2(self):
    ModelCatalog.register_custom_model("invalid2", InvalidModel2)
    self.assertRaisesRegexp(
        ValueError, "Expected output.*",
        lambda: PGAgent(
            env="CartPole-v0", config={
                "model": {
                    "custom_model": "invalid2",
                },
            }))
def testCallbacks(self):
    counts = Counter()
    pg = PGAgent(env="CartPole-v0", config={
        "num_workers": 0,
        "sample_batch_size": 50,
        "callbacks": {
            "on_episode_start": lambda x: counts.update({"start": 1}),
            "on_episode_step": lambda x: counts.update({"step": 1}),
            "on_episode_end": lambda x: counts.update({"end": 1}),
            "on_sample_end": lambda x: counts.update({"sample": 1}),
        },
    })
    pg.train()
    pg.train()
    pg.train()
    pg.train()
    self.assertEqual(counts["sample"], 4)
    self.assertGreater(counts["start"], 0)
    self.assertGreater(counts["end"], 0)
    self.assertGreater(counts["step"], 200)
    self.assertLess(counts["step"], 400)
def testTrainMultiCartpoleMultiPolicy(self):
    n = 10
    register_env("multi_cartpole", lambda _: MultiCartpole(n))
    single_env = gym.make("CartPole-v0")

    def gen_policy():
        config = {
            "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
            "n_step": random.choice([1, 2, 3, 4, 5]),
        }
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        return (PGPolicyGraph, obs_space, act_space, config)

    pg = PGAgent(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "multiagent": {
                "policy_graphs": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": lambda agent_id: "policy_1",
            },
        })

    # Just check that it runs without crashing
    for i in range(10):
        result = pg.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
    self.assertTrue(
        pg.compute_action([0, 0, 0, 0], policy_id="policy_1") in [0, 1])
    self.assertTrue(
        pg.compute_action([0, 0, 0, 0], policy_id="policy_2") in [0, 1])
    self.assertRaises(
        KeyError,
        lambda: pg.compute_action([0, 0, 0, 0], policy_id="policy_3"))
def testCallbacks(self):
    counts = Counter()
    pg = PGAgent(
        env="CartPole-v0",
        config={
            "num_workers": 0,
            "sample_batch_size": 50,
            "train_batch_size": 50,
            "callbacks": {
                "on_episode_start": lambda x: counts.update({"start": 1}),
                "on_episode_step": lambda x: counts.update({"step": 1}),
                "on_episode_end": lambda x: counts.update({"end": 1}),
                "on_sample_end": lambda x: counts.update({"sample": 1}),
            },
        })
    pg.train()
    pg.train()
    pg.train()
    pg.train()
    self.assertEqual(counts["sample"], 4)
    self.assertGreater(counts["start"], 0)
    self.assertGreater(counts["end"], 0)
    self.assertGreater(counts["step"], 200)
    self.assertLess(counts["step"], 400)
def testMultiAgent(self):
    register_env("multi_cartpole", lambda _: MultiCartpole(10))
    single_env = gym.make("CartPole-v0")

    def gen_policy():
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        return (PGPolicyGraph, obs_space, act_space, {})

    pg = PGAgent(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "output": self.test_dir,
            "multiagent": {
                "policy_graphs": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(
                        ["policy_1", "policy_2"])),
            },
        })
    pg.train()
    self.assertEqual(len(os.listdir(self.test_dir)), 1)
    pg.stop()

    pg = PGAgent(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "input": self.test_dir,
            "input_evaluation": ["simulation"],
            "train_batch_size": 2000,
            "multiagent": {
                "policy_graphs": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": (
                    lambda agent_id: random.choice(
                        ["policy_1", "policy_2"])),
            },
        })
    for _ in range(50):
        result = pg.train()
        if not np.isnan(result["episode_reward_mean"]):
            return  # simulation ok
        time.sleep(0.1)
    assert False, "did not see any simulation results"
def testQueryEvaluators(self):
    register_env("test", lambda _: gym.make("CartPole-v0"))
    pg = PGAgent(env="test", config={
        "num_workers": 2,
        "sample_batch_size": 5,
        "num_envs_per_worker": 2,
    })
    results = pg.optimizer.foreach_evaluator(
        lambda ev: ev.sample_batch_size)
    results2 = pg.optimizer.foreach_evaluator_with_index(
        lambda ev, i: (i, ev.sample_batch_size))
    results3 = pg.optimizer.foreach_evaluator(
        lambda ev: ev.foreach_env(lambda env: 1))
    # Each evaluator's effective batch size scales with num_envs_per_worker
    # (5 per env * 2 envs = 10), reported for local plus 2 remote workers.
    self.assertEqual(results, [10, 10, 10])
    self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)])
    self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]])
def testTrainMultiCartpoleMultiPolicy(self):
    n = 10
    register_env("multi_cartpole", lambda _: MultiCartpole(n))
    single_env = gym.make("CartPole-v0")

    def gen_policy():
        config = {
            "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
            "n_step": random.choice([1, 2, 3, 4, 5]),
        }
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        # None falls back to the agent's default policy graph class
        return (None, obs_space, act_space, config)

    pg = PGAgent(
        env="multi_cartpole",
        config={
            "num_workers": 0,
            "multiagent": {
                "policy_graphs": {
                    "policy_1": gen_policy(),
                    "policy_2": gen_policy(),
                },
                "policy_mapping_fn": lambda agent_id: "policy_1",
            },
        })

    # Just check that it runs without crashing
    for i in range(10):
        result = pg.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
    self.assertTrue(
        pg.compute_action([0, 0, 0, 0], policy_id="policy_1") in [0, 1])
    self.assertTrue(
        pg.compute_action([0, 0, 0, 0], policy_id="policy_2") in [0, 1])
    self.assertRaises(
        KeyError,
        lambda: pg.compute_action([0, 0, 0, 0], policy_id="policy_3"))
def testNoStepOnInit(self):
    register_env("fail", lambda _: FailOnStepEnv())
    pg = PGAgent(env="fail", config={"num_workers": 1})
    self.assertRaises(Exception, lambda: pg.train())
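# Hypothetical stand-in for FailOnStepEnv (its definition is not included in
# these excerpts): an env that raises on any interaction, so the test above
# passes only if constructing the agent never steps the env, while train()
# must surface the error.
import gym

class FailOnStepEnv(gym.Env):
    def __init__(self):
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        raise ValueError("kaboom")

    def step(self, action):
        raise ValueError("kaboom")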
# Initialize the trainer. Since all agents use the same policy graph class,
# one trainer is fine here; otherwise we'd need one trainer per policy graph
# used for training.
trainer = PGAgent(
    env='Pomme_v0',
    config={
        'multiagent': {
            'policy_graphs': {
                'agent_0': (PGPolicyGraph, obs_space, act_space, {
                    "gamma": 0.85
                }),
                'agent_1': (PGPolicyGraph, obs_space, act_space, {
                    "gamma": 0.90
                }),
                'agent_2': (PGPolicyGraph, obs_space, act_space, {
                    "gamma": 0.95
                }),
                'agent_3': (PGPolicyGraph, obs_space, act_space, {
                    "gamma": 0.99
                }),
            },
            'policy_mapping_fn': lambda agent_id: agent_id
        },
        'model': {
            'custom_preprocessor': 'Featurize_Preprocessor'
        },
        'env_config': env_config
    })
print('\nTrainer Config:\n', trainer.config, '\n')
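# Minimal follow-up sketch (not part of the original snippet, which stops
# after printing the config): run a few training iterations, mirroring the
# training loops used in the other examples here. The iteration count is
# arbitrary.
for i in range(10):
    result = trainer.train()
    print("Iteration {}, reward {}, timesteps {}".format(
        i, result["episode_reward_mean"], result["timesteps_total"]))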
def testRolloutDictSpace(self):
    register_env("nested", lambda _: NestedDictEnv())
    agent = PGAgent(env="nested")
    agent.train()
    path = agent.save()
    agent.stop()

    # Test train works on restore
    agent2 = PGAgent(env="nested")
    agent2.restore(path)
    agent2.train()

    # Test rollout works on restore
    rollout(agent2, "nested", 100)
"num_workers": 0, # Configure the agent to run short iterations for debugging "exploration_fraction": 0.01, "learning_starts": 100, "timesteps_per_iteration": 200, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, }, }) elif args.run == "PG": agent = PGAgent( env="srv", config={ "num_workers": 0, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, }, }) # Attempt to restore from checkpoint if possible. if os.path.exists(args.checkpoint_file): checkpoint_file = open(args.checkpoint_file).read() print("Restoring from checkpoint path", checkpoint_file) agent.restore(checkpoint_file) # Serving and training loop while True: print(pretty_print(agent.train())) checkpoint_file = agent.save()
"num_workers": 4, "model": { "custom_model": "mask_model", }, "env_config": { "pymarl_path": path_to_pymarl } } if args.run.lower() == "qmix": def grouped_sc2(cfg): env = SC2MultiAgentEnv(cfg) agent_list = list(range(env._starcraft_env.n_agents)) grouping = { "group_1": agent_list, } obs_space = Tuple([env.observation_space for i in agent_list]) act_space = Tuple([env.action_space for i in agent_list]) return env.with_agent_groups( grouping, obs_space=obs_space, act_space=act_space) register_env("grouped_starcraft", grouped_sc2) agent = QMixAgent(env="grouped_starcraft", config=agent_cfg) elif args.run.lower() == "pg": agent = PGAgent(env="starcraft", config=agent_cfg) elif args.run.lower() == "ppo": agent_cfg.update({"vf_share_layers": True}) agent = PPOAgent(env="starcraft", config=agent_cfg) for i in range(args.num_iters): print(pretty_print(agent.train()))