def _register_if_needed(env_object):
    if isinstance(env_object, six.string_types):
        return env_object
    elif isinstance(env_object, type):
        name = env_object.__name__
        register_env(name, lambda config: env_object(config))
        return name
def check_support(alg, config, stats, check_bounds=False):
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing", alg, action_space, obs_space, "===")
            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
            try:
                a = get_agent_class(alg)(config=config, env="stub_env")
                a.train()
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
            print(stat)
            print()
            stats[alg, a_name, o_name] = stat
def testPyTorchModel(self): ModelCatalog.register_custom_model("composite", TorchSpyModel) register_env("nested", lambda _: NestedDictEnv()) a2c = A2CAgent( env="nested", config={ "num_workers": 0, "use_pytorch": True, "sample_batch_size": 5, "train_batch_size": 5, "model": { "custom_model": "composite", }, }) a2c.train() # Check that the model sees the correct reconstructed observations for i in range(4): seen = pickle.loads( ray.experimental.internal_kv._internal_kv_get( "torch_spy_in_{}".format(i))) pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist() cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist() task_i = one_hot( DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5) self.assertEqual(seen[0][0].tolist(), pos_i) self.assertEqual(seen[1][0].tolist(), cam_i) self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedTuple(self, make_env): ModelCatalog.register_custom_model("composite2", TupleSpyModel) register_env("nested2", make_env) pg = PGAgent( env="nested2", config={ "num_workers": 0, "sample_batch_size": 5, "train_batch_size": 5, "model": { "custom_model": "composite2", }, }) pg.train() # Check that the model sees the correct reconstructed observations for i in range(4): seen = pickle.loads( ray.experimental.internal_kv._internal_kv_get( "t_spy_in_{}".format(i))) pos_i = TUPLE_SAMPLES[i][0].tolist() cam_i = TUPLE_SAMPLES[i][1][0].tolist() task_i = one_hot(TUPLE_SAMPLES[i][2], 5) self.assertEqual(seen[0][0].tolist(), pos_i) self.assertEqual(seen[1][0].tolist(), cam_i) self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedDict(self, make_env, test_lstm=False): ModelCatalog.register_custom_model("composite", DictSpyModel) register_env("nested", make_env) pg = PGAgent( env="nested", config={ "num_workers": 0, "sample_batch_size": 5, "train_batch_size": 5, "model": { "custom_model": "composite", "use_lstm": test_lstm, }, }) pg.train() # Check that the model sees the correct reconstructed observations for i in range(4): seen = pickle.loads( ray.experimental.internal_kv._internal_kv_get( "d_spy_in_{}".format(i))) pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist() cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist() task_i = one_hot( DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5) self.assertEqual(seen[0][0].tolist(), pos_i) self.assertEqual(seen[1][0].tolist(), cam_i) self.assertEqual(seen[2][0].tolist(), task_i)
def testMinibatchSequencing(self): ModelCatalog.register_custom_model("rnn", RNNSpyModel) register_env("counter", lambda _: DebugCounterEnv()) ppo = PPOAgent( env="counter", config={ "num_workers": 0, "sample_batch_size": 20, "train_batch_size": 20, "sgd_minibatch_size": 10, "vf_share_layers": True, "simple_optimizer": False, "num_sgd_iter": 1, "model": { "custom_model": "rnn", "max_seq_len": 4, }, }) ppo.train() ppo.train() # first epoch: 20 observations get split into 2 minibatches of 8 # four observations are discarded batch0 = pickle.loads( ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0")) batch1 = pickle.loads( ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1")) if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]: batch0, batch1 = batch1, batch0 # sort minibatches self.assertEqual(batch0["seq_lens"].tolist(), [4, 4]) self.assertEqual(batch1["seq_lens"].tolist(), [4, 3]) self.assertEqual(batch0["sequences"].tolist(), [ [[0], [1], [2], [3]], [[4], [5], [6], [7]], ]) self.assertEqual(batch1["sequences"].tolist(), [ [[8], [9], [10], [11]], [[12], [13], [14], [0]], ]) # second epoch: 20 observations get split into 2 minibatches of 8 # four observations are discarded batch2 = pickle.loads( ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2")) batch3 = pickle.loads( ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3")) if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]: batch2, batch3 = batch3, batch2 self.assertEqual(batch2["seq_lens"].tolist(), [4, 4]) self.assertEqual(batch3["seq_lens"].tolist(), [2, 4]) self.assertEqual(batch2["sequences"].tolist(), [ [[5], [6], [7], [8]], [[9], [10], [11], [12]], ]) self.assertEqual(batch3["sequences"].tolist(), [ [[13], [14], [0], [0]], [[0], [1], [2], [3]], ])
def testTrainCartpole(self):
    register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
    pg = PGAgent(env="test", config={"num_workers": 0})
    for i in range(100):
        result = pg.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
        if result["episode_reward_mean"] >= 100:
            return
    raise Exception("failed to improve reward")
def testSimpleOptimizerSequencing(self): ModelCatalog.register_custom_model("rnn", RNNSpyModel) register_env("counter", lambda _: DebugCounterEnv()) ppo = PPOAgent( env="counter", config={ "num_workers": 0, "sample_batch_size": 10, "train_batch_size": 10, "sgd_minibatch_size": 10, "vf_share_layers": True, "simple_optimizer": True, "num_sgd_iter": 1, "model": { "custom_model": "rnn", "max_seq_len": 4, }, }) ppo.train() ppo.train() batch0 = pickle.loads( ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0")) self.assertEqual( batch0["sequences"].tolist(), [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]]) self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2]) self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0]) self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0]) self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0) self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0) self.assertTrue( np.allclose(batch0["state_in"][0].tolist()[1:], batch0["state_out"][0].tolist()[:-1])) self.assertTrue( np.allclose(batch0["state_in"][1].tolist()[1:], batch0["state_out"][1].tolist()[:-1])) batch1 = pickle.loads( ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1")) self.assertEqual(batch1["sequences"].tolist(), [ [[10], [11], [12], [13]], [[14], [0], [0], [0]], [[0], [1], [2], [3]], [[4], [0], [0], [0]], ]) self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1]) self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0]) self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0]) self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0) self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0) self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0) self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0) self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0) self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
def run(args, parser): def create_environment(env_config): # This import must happen inside the method so that worker processes import this code import roboschool return gym.make(args.env) if not args.config: # Load configuration from file config_dir = os.path.dirname(args.checkpoint) # params.json is saved in the model directory during ray training by default config_path = os.path.join(config_dir, "params.json") with open(config_path) as f: args.config = json.load(f) if not args.env: if not args.config.get("env"): parser.error("the following arguments are required: --env") args.env = args.config.get("env") ray.init() register_env(args.env, create_environment) cls = get_agent_class(args.algorithm) config = args.config config["monitor"] = False config["num_workers"] = 1 config["num_gpus"] = 0 agent = cls(env=args.env, config=config) agent.restore(args.checkpoint) num_episodes = int(args.evaluate_episodes) if args.algorithm == "DQN": env = gym.make(args.env) env = wrap_dqn(env, args.config.get("model", {})) else: env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env)) env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True) all_rewards = [] for episode in range(num_episodes): steps = 0 state = env.reset() done = False reward_total = 0.0 while not done: action = agent.compute_action(state) next_state, reward, done, _ = env.step(action) reward_total += reward steps += 1 state = next_state all_rewards.append(reward_total) print("Episode reward: %s. Episode steps: %s" % (reward_total, steps)) print("Mean Reward:", np.mean(all_rewards)) print("Max Reward:", np.max(all_rewards)) print("Min Reward:", np.min(all_rewards))
def testTrainMultiCartpoleSinglePolicy(self):
    n = 10
    register_env("multi_cartpole", lambda _: MultiCartpole(n))
    pg = PGAgent(env="multi_cartpole", config={"num_workers": 0})
    for i in range(100):
        result = pg.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
        if result["episode_reward_mean"] >= 50 * n:
            return
    raise Exception("failed to improve reward")
def check_support_multiagent(alg, config):
    register_env("multi_mountaincar", lambda _: MultiMountainCar(2))
    register_env("multi_cartpole", lambda _: MultiCartpole(2))
    if "DDPG" in alg:
        a = get_agent_class(alg)(config=config, env="multi_mountaincar")
    else:
        a = get_agent_class(alg)(config=config, env="multi_cartpole")
    try:
        a.train()
    finally:
        a.stop()
def _register_if_needed(self, env_object):
    if isinstance(env_object, six.string_types):
        return env_object
    elif isinstance(env_object, type):
        name = env_object.__name__
        register_env(name, lambda config: env_object(config))
        return name
    raise ValueError(
        "{} is an invalid env specification. ".format(env_object) +
        "You can specify a custom env as either a class "
        "(e.g., YourEnvCls) or a registered env id (e.g., \"your_env\").")
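# For illustration only (not from the original source): a minimal sketch of
# the two env specifications the helper above accepts. "MyEnv" is a
# hypothetical gym.Env subclass taking an env_config dict, and PGAgent stands
# in for any Agent/Trainer class.
agent_from_class = PGAgent(env=MyEnv, config={"num_workers": 0})
# Equivalent once registered: the class is auto-registered under its
# __name__, so the same env can also be referenced by the string id.
agent_from_id = PGAgent(env="MyEnv", config={"num_workers": 0})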
def testTrainCartpoleOffPolicy(self):
    register_env(
        "test3", lambda _: PartOffPolicyServing(
            gym.make("CartPole-v0"), off_pol_frac=0.2))
    dqn = DQNAgent(env="test3", config={"exploration_fraction": 0.001})
    for i in range(100):
        result = dqn.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
        if result["episode_reward_mean"] >= 100:
            return
    raise Exception("failed to improve reward")
def testQueryEvaluators(self):
    register_env("test", lambda _: gym.make("CartPole-v0"))
    pg = PGAgent(
        env="test", config={
            "num_workers": 2,
            "sample_batch_size": 5
        })
    results = pg.optimizer.foreach_evaluator(
        lambda ev: ev.sample_batch_size)
    results2 = pg.optimizer.foreach_evaluator_with_index(
        lambda ev, i: (i, ev.sample_batch_size))
    self.assertEqual(results, [5, 5, 5])
    self.assertEqual(results2, [(0, 5), (1, 5), (2, 5)])
def testMultiAgent(self): register_env("multi_cartpole", lambda _: MultiCartpole(10)) single_env = gym.make("CartPole-v0") def gen_policy(): obs_space = single_env.observation_space act_space = single_env.action_space return (PGPolicyGraph, obs_space, act_space, {}) pg = PGAgent( env="multi_cartpole", config={ "num_workers": 0, "output": self.test_dir, "multiagent": { "policy_graphs": { "policy_1": gen_policy(), "policy_2": gen_policy(), }, "policy_mapping_fn": ( lambda agent_id: random.choice( ["policy_1", "policy_2"])), }, }) pg.train() self.assertEqual(len(os.listdir(self.test_dir)), 1) pg.stop() pg = PGAgent( env="multi_cartpole", config={ "num_workers": 0, "input": self.test_dir, "input_evaluation": ["simulation"], "train_batch_size": 2000, "multiagent": { "policy_graphs": { "policy_1": gen_policy(), "policy_2": gen_policy(), }, "policy_mapping_fn": ( lambda agent_id: random.choice( ["policy_1", "policy_2"])), }, }) for _ in range(50): result = pg.train() if not np.isnan(result["episode_reward_mean"]): return # simulation ok time.sleep(0.1) assert False, "did not see any simulation results"
def testRolloutDictSpace(self):
    register_env("nested", lambda _: NestedDictEnv())
    agent = PGAgent(env="nested")
    agent.train()
    path = agent.save()
    agent.stop()

    # Test train works on restore
    agent2 = PGAgent(env="nested")
    agent2.restore(path)
    agent2.train()

    # Test rollout works on restore
    rollout(agent2, "nested", 100)
def testMultiAgentComplexSpaces(self): ModelCatalog.register_custom_model("dict_spy", DictSpyModel) ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel) register_env("nested_ma", lambda _: NestedMultiAgentEnv()) act_space = spaces.Discrete(2) pg = PGAgent( env="nested_ma", config={ "num_workers": 0, "sample_batch_size": 5, "train_batch_size": 5, "multiagent": { "policy_graphs": { "tuple_policy": ( PGPolicyGraph, TUPLE_SPACE, act_space, {"model": {"custom_model": "tuple_spy"}}), "dict_policy": ( PGPolicyGraph, DICT_SPACE, act_space, {"model": {"custom_model": "dict_spy"}}), }, "policy_mapping_fn": lambda a: { "tuple_agent": "tuple_policy", "dict_agent": "dict_policy"}[a], }, }) pg.train() for i in range(4): seen = pickle.loads( ray.experimental.internal_kv._internal_kv_get( "d_spy_in_{}".format(i))) pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist() cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist() task_i = one_hot( DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5) self.assertEqual(seen[0][0].tolist(), pos_i) self.assertEqual(seen[1][0].tolist(), cam_i) self.assertEqual(seen[2][0].tolist(), task_i) for i in range(4): seen = pickle.loads( ray.experimental.internal_kv._internal_kv_get( "t_spy_in_{}".format(i))) pos_i = TUPLE_SAMPLES[i][0].tolist() cam_i = TUPLE_SAMPLES[i][1][0].tolist() task_i = one_hot(TUPLE_SAMPLES[i][2], 5) self.assertEqual(seen[0][0].tolist(), pos_i) self.assertEqual(seen[1][0].tolist(), cam_i) self.assertEqual(seen[2][0].tolist(), task_i)
def testQueryEvaluators(self): register_env("test", lambda _: gym.make("CartPole-v0")) pg = PGAgent( env="test", config={ "num_workers": 2, "sample_batch_size": 5, "num_envs_per_worker": 2, }) results = pg.optimizer.foreach_evaluator( lambda ev: ev.sample_batch_size) results2 = pg.optimizer.foreach_evaluator_with_index( lambda ev, i: (i, ev.sample_batch_size)) results3 = pg.optimizer.foreach_evaluator( lambda ev: ev.foreach_env(lambda env: 1)) self.assertEqual(results, [10, 10, 10]) self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)]) self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]])
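# Note (added for clarity, based on the assertions above): with
# "num_envs_per_worker": 2, each evaluator runs two sub-environments, so the
# effective per-evaluator sample_batch_size reported here is 5 * 2 = 10, and
# foreach_env invokes its callback once per sub-environment (hence [1, 1]
# from each of the local and two remote evaluators).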
def testTrainMultiCartpoleMultiPolicy(self): n = 10 register_env("multi_cartpole", lambda _: MultiCartpole(n)) single_env = gym.make("CartPole-v0") def gen_policy(): config = { "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]), "n_step": random.choice([1, 2, 3, 4, 5]), } obs_space = single_env.observation_space act_space = single_env.action_space return (PGPolicyGraph, obs_space, act_space, config) pg = PGAgent( env="multi_cartpole", config={ "num_workers": 0, "multiagent": { "policy_graphs": { "policy_1": gen_policy(), "policy_2": gen_policy(), }, "policy_mapping_fn": lambda agent_id: "policy_1", }, }) # Just check that it runs without crashing for i in range(10): result = pg.train() print("Iteration {}, reward {}, timesteps {}".format( i, result["episode_reward_mean"], result["timesteps_total"])) self.assertTrue( pg.compute_action([0, 0, 0, 0], policy_id="policy_1") in [0, 1]) self.assertTrue( pg.compute_action([0, 0, 0, 0], policy_id="policy_2") in [0, 1]) self.assertRaises( KeyError, lambda: pg.compute_action([0, 0, 0, 0], policy_id="policy_3"))
register( id=env_name, entry_point='ray.rllib.examples:' + "MultiAgentMountainCarEnv", max_episode_steps=200, kwargs={} ) def create_env(env_config): pass_params_to_gym(env_name) env = gym.envs.make(env_name) return env if __name__ == '__main__': register_env(env_name, lambda env_config: create_env(env_config)) config = ppo.DEFAULT_CONFIG.copy() horizon = 10 num_cpus = 4 ray.init(num_cpus=num_cpus, redirect_output=True) config["num_workers"] = num_cpus config["timesteps_per_batch"] = 10 config["num_sgd_iter"] = 10 config["gamma"] = 0.999 config["horizon"] = horizon config["use_gae"] = False config["model"].update({"fcnet_hiddens": [256, 256]}) options = {"multiagent_obs_shapes": [2, 2], "multiagent_act_shapes": [1, 1], "multiagent_shared_model": False, "multiagent_fcnet_hiddens": [[32, 32]] * 2}
return 0 def step(self, action): return 0, 0, True, {} def leaked_processes(): """Returns whether any subprocesses were leaked.""" result = subprocess.check_output( "ps aux | grep '{}' | grep -v grep || true".format(UNIQUE_CMD), shell=True) return result if __name__ == "__main__": register_env("subproc", lambda config: EnvWithSubprocess(config)) ray.init() assert os.path.exists(UNIQUE_FILE_0) assert os.path.exists(UNIQUE_FILE_1) assert not leaked_processes() run_experiments({ "demo": { "run": "PG", "env": "subproc", "num_samples": 1, "config": { "num_workers": 1, }, "stop": { "training_iteration": 1 },
env_name = 'sonic_env'

# Note that the hyperparameters have been tuned for Sonic, which can be run
# by replacing the below function with:
#
# register_env(env_name, lambda config: sonic_on_ray.make(
#     game='SonicTheHedgehog-Genesis',
#     state='GreenHillZone.Act1'))
#
# However, to try Sonic, you have to obtain the ROM yourself (see the
# instructions at https://github.com/openai/retro/blob/master/README.md).

# register_env(env_name,
#              lambda config: sonic_on_ray.make(game='Airstriker-Genesis',
#                                               state='Level1'))

register_env(env_name,
             lambda config: sonic_on_ray.make(
                 game='BustAMove-Snes', state='BustAMove.1pplay.Level10'))

ray.init()

run_experiments({
    'sonic-ppo': {
        'run': 'PPO',
        'env': 'sonic_env',
        # 'trial_resources': {
        #     'gpu': 2,  # note: keep this in sync with 'devices' config value
        #     'cpu': lambda spec: spec.config.num_workers,  # one cpu per worker
        # },
        'config': {
            # grid search over learning rate
            'sgd_stepsize': grid_search([1e-4, 5e-5, 1e-5, 5e-6]),
def _reset(self): self.cur_pos = 0 return [self.cur_pos] def _step(self, action): assert action in [0, 1] if action == 0 and self.cur_pos > 0: self.cur_pos -= 1 elif action == 1: self.cur_pos += 1 done = self.cur_pos >= self.end_pos return [self.cur_pos], 1 if done else 0, done, {} if __name__ == "__main__": env_creator_name = "corridor" register_env(env_creator_name, lambda config: SimpleCorridor(config)) ray.init() run_experiments({ "demo": { "run": "PPO", "env": "corridor", "config": { "env_config": { "corridor_length": 5, }, }, }, })
def test_no_step_on_init(self):
    # Allow for Unittest run.
    ray.init(num_cpus=5, ignore_reinit_error=True)
    register_env("fail", lambda _: FailOnStepEnv())
    pg = PGTrainer(env="fail", config={"num_workers": 1})
    self.assertRaises(Exception, lambda: pg.train())
auxiliary_name_scope=False): last_layer = slim.fully_connected( input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1") last_layer = slim.fully_connected( last_layer, 64, activation_fn=tf.nn.relu, scope="fc2") output = slim.fully_connected( last_layer, num_outputs, activation_fn=None, scope="fc_out") return output, last_layer if __name__ == "__main__": args = parser.parse_args() ray.init() # Simple environment with `num_agents` independent cartpole entities register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents)) ModelCatalog.register_custom_model("model1", CustomModel1) ModelCatalog.register_custom_model("model2", CustomModel2) single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space # Each policy can have a different configuration (including custom model) def gen_policy(i): config = { "model": { "custom_model": ["model1", "model2"][i % 2], }, "gamma": random.choice([0.95, 0.99]), } return (PPOPolicyGraph, obs_space, act_space, config)
return parser.parse_args() if __name__ == '__main__': args = get_parser() # Start ray ray.init() # NOTE: We are using DuckietownLF environment because SteeringToWheelVelWrapper does not cooperate with multimap. ModelCatalog.register_custom_model( "image-ddpg", DDPGRLLibModel, ) register_env("DuckieTown-MultiMap", lambda _: MultiMapSteeringToWheelVelWrapper(MultiMapEnv())) csv_path = "searches/ddpg_results.csv" starting_idx = 0 if os.path.exists(csv_path): with open(csv_path, mode="r") as f: starting_idx = len(f.readlines()) for search_idx in trange(args.n_searches, desc="Searches"): config = { "framework": "torch", "model": { "custom_model": "image-ddpg", }, # "use_state_preprocessor": True, "learning_starts": 0,
spaces.Box( low=-10, high=10, shape=(config["observation_size"],), dtype=np.float32)) def run(self): print("Starting policy server at {}:{}".format(SERVER_ADDRESS, SERVER_PORT)) server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT) server.serve_forever() if __name__ == "__main__": args = parser.parse_args() ray.init() register_env("srv", lambda config: SimpleServing(config)) if args.run == "DQN": trainer = DQNTrainer( env="srv", config={ # Use a single process to avoid needing a load balancer "num_workers": 0, # Configure the trainer to run short iterations for debugging "exploration_fraction": 0.01, "learning_starts": 100, "timesteps_per_iteration": 200, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, },
import ray
from ray.tune.registry import register_env
from ray.tune import run_experiments
from ray.rllib.agents import ppo


def env_creator(env_config={}):
    env = QuadLocEnv(dataDir='/home/Pearl/quantm/RL_env/data/', num=500)
    num = env.action_space.n
    # print("Action:", num)
    env.reset()
    return env


############
ray.init(use_raylet=True, redis_password=os.urandom(128).hex())
register_env("QuadLocEnv-v0", env_creator)

experiment_spec = {
    "custom_env": {
        "run": "A3C",
        "env": "QuadLocEnv-v0",
        # "restore": checkpoint,
        "config": {
            "model": {
                "custom_model": "ConvNet2D",
            },
        },
# save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) # generating a string version of flow_params config['env_config'][ 'flow_params'] = flow_json # adding the flow_params to config dict config['env_config']['run'] = alg_run # Call the utility function make_create_env to be able to # register the Flow env for this experiment create_env, gym_name = make_create_env(params=flow_params, version=0) config["env"] = gym_name # Register as rllib env with Gym register_env(gym_name, create_env) exp = Experiment( flow_params["exp_tag"], **{ "run": alg_run, "config": { **config }, "checkpoint_freq": 5, # number of iterations between checkpoints "checkpoint_at_end": True, # generate a checkpoint at the end "max_failures": 5, "stop": { # stopping conditions "training_iteration": 400, # number of iterations to stop after }, "num_samples": 1,
def run(args, parser): def create_environment(env_config): return gym.make(args.env) if not args.config: # Load configuration from file config_dir = os.path.dirname(args.checkpoint) # params.json is saved in the model directory during ray training by default config_path = os.path.join(config_dir, "params.json") with open(config_path) as f: args.config = json.load(f) if not args.env: if not args.config.get("env"): parser.error("the following arguments are required: --env") args.env = args.config.get("env") ray.init() register_env(args.env, create_environment) if ray.__version__ >= "0.6.5": from ray.rllib.agents.registry import get_agent_class else: from ray.rllib.agents.agent import get_agent_class cls = get_agent_class(args.algorithm) config = args.config config["monitor"] = False config["num_workers"] = 1 config["num_gpus"] = 0 agent = cls(env=args.env, config=config) agent.restore(args.checkpoint) num_episodes = int(args.evaluate_episodes) if ray.__version__ >= "0.6.5": env = gym.make(args.env) else: from ray.rllib.agents.dqn.common.wrappers import wrap_dqn if args.algorithm == "DQN": env = gym.make(args.env) env = wrap_dqn(env, args.config.get("model", {})) else: env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env)) env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True) all_rewards = [] for episode in range(num_episodes): steps = 0 state = env.reset() done = False reward_total = 0.0 while not done: action = agent.compute_action(state) next_state, reward, done, _ = env.step(action) reward_total += reward steps += 1 state = next_state all_rewards.append(reward_total) print("Episode reward: %s. Episode steps: %s" % (reward_total, steps)) print("Mean Reward:", np.mean(all_rewards)) print("Max Reward:", np.max(all_rewards)) print("Min Reward:", np.min(all_rewards))
from ray.tune.registry import register_env from wanderer_roborobo import WandererRoborobo if __name__ == "__main__": ray.init(num_cpus=1, num_gpus=1) #%% n_players = 1 max_moves = 1000 agents_id = ['player{:d}'.format(i) for i in range(n_players)] actions = {agents_id[i]: 1 for i in range(n_players)} register_env("wanderer_roborobo", lambda _: WandererRoborobo(n_players, max_moves)) act_space = WandererRoborobo.action_space obs_space = WandererRoborobo.observation_space policies = { agents_id[i]: (None, obs_space, act_space, {}) for i in range(n_players) } def select_policy(agent_id): return agent_id config = { "num_gpus": 0, 'num_workers': 0,
from ray.tune.registry import register_env
from nes_py.wrappers import JoypadSpace
from ray.rllib.env.atari_wrappers import WarpFrame
import gym_tetris
from gym_tetris.actions import MOVEMENT


def tetris_env_creator(version="TetrisA-v0"):
    def env_creator(env_config):
        env = gym_tetris.make(version)
        env = JoypadSpace(env, MOVEMENT)
        env = WarpFrame(env, dim=84)
        return env

    return env_creator


register_env("TetrisA-v0", tetris_env_creator("TetrisA-v0"))
register_env("TetrisA-v1", tetris_env_creator("TetrisA-v1"))
register_env("TetrisA-v2", tetris_env_creator("TetrisA-v2"))
register_env("TetrisA-v3", tetris_env_creator("TetrisA-v3"))
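# For illustration only (not from the original module): once an id has been
# registered via register_env, it can be referenced by name as the "env"
# setting of any RLlib trainer or Tune experiment. A minimal sketch, assuming
# the TetrisA-v0 registration above has already run and a Tune-compatible Ray
# version is installed:
import ray
from ray import tune

ray.init()
tune.run(
    "PPO",
    stop={"training_iteration": 1},
    config={
        "env": "TetrisA-v0",  # the id registered above
        "num_workers": 1,
    },
)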
sgd_minibatch_size=128)) # Combined training flow train_op = Concurrently([ppo_train_op, dqn_train_op], mode="async", output_indexes=[1]) return StandardMetricsReporting(train_op, workers, config) if __name__ == "__main__": args = parser.parse_args() ray.init() # Simple environment with 4 independent cartpole entities register_env("multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4})) single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space # Note that since the trainer below does not include a default policy or # policy configs, we have to explicitly set it in the multiagent config: policies = { "ppo_policy": (PPOTFPolicy, obs_space, act_space, PPO_CONFIG), "dqn_policy": (DQNTFPolicy, obs_space, act_space, DQN_CONFIG), } def policy_mapping_fn(agent_id): if agent_id % 2 == 0: return "ppo_policy" else:
        self.dts_taken_so_far = 1
        return self.env.env.robot.calc_state()  # return state of robot

    def step(self, action):
        input("Press Enter .....")
        # returns states; the last 2 numbers indicate whether each foot is in
        # contact with the ground
        print("Collisions for feet: ", self.env.env.robot.calc_state()[20],
              " ", self.env.env.robot.calc_state()[21])
        self.dts_taken_so_far += 1
        if self.debug:
            print("Time elapsed in episode: ",
                  self.dts_taken_so_far * self.env.env.scene.dt)
            print("Number of dt's taken in episode: ", self.dts_taken_so_far)
        return self.env.step(action)


from ray.tune.registry import register_env

register_env("cartpolebulletenv", lambda config: MultiEnv(config))
register_env("reacherbulletenv", lambda config: ReacherEnv(config))
register_env("pusherbulletenv", lambda config: PusherEnv(config))
register_env("throwerbulletenv", lambda config: ThrowerEnv(config))
register_env("strikerbulletenv", lambda config: StrikerEnv(config))
register_env("walkerbulletenv", lambda config: WalkerEnv(config))
# register_env("octoenv", lambda config: OctoEnv(config))
# trainer = ppo.PPOTrainer(config=config, env="octoenv")

###########


class RolloutSaver:
    """Utility class for storing rollouts.
def main(argv):
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},
        # built-in options
        # Number of hidden layers for the fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2
    # read out command line arguments
    try:
        opts, args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker - number of workers to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # model
                "model": model,
                "gamma": 0.8,
                "noisy": False,
                "num_gpus": 1,
                # evaluation: everything default, see dqn.py
                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py
                # replay buffer
                # Size of the replay buffer. Note that if async_updates is
                # set, then each worker will have a replay buffer of this
                # size. Default 50000.
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": True,
                # many more parameters are left untouched here (see dqn.py)
                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.01,
                # Size of rollout batch
                # Default sample batch size (unroll length). Batches of this
                # size are collected from workers until train_batch_size is
                # met. When using multiple envs per worker, this is
                # multiplied by num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >=
                # sample_batch_size. Sample batches will be concatenated
                # together to this size for training.
                "train_batch_size": 256,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # parallelism
                "num_workers": num_workers,
                # distribute epsilon over workers (default for apex)
                "per_worker_exploration": True,
                # determine per worker which experience should be
                # prioritized, before giving those to the shared experience
                # memory
                "worker_side_prioritization": True,
                # "schedule_max_timesteps": 100000,  # what does this do?
                # "timesteps_per_iteration": 25000,  # what does this do?
                # "min_iter_time_s": 30,  # what does this do?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # model
                # several threads per worker! set to False for debugging
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.8,
                "noisy": False,
                "num_gpus": 1,
                # Whether to use dueling dqn
                "dueling": True,
                # Whether to use double dqn
                "double_q": True,
                # evaluation: everything default, see dqn.py
                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py
                # replay buffer
                # Size of the replay buffer. Note that if async_updates is
                # set, then each worker will have a replay buffer of this
                # size. Default 50000.
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": True,
                # many more parameters are left untouched here (see dqn.py)
                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.01,
                # Update the replay buffer with this many samples at once.
                # Note that this setting applies per-worker if
                # num_workers > 1.
                # "sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Size of a batch sampled from the replay buffer for
                # training. Note that if async_updates is set, then each
                # worker returns gradients for a batch of this size
                # (minibatch size). Should be >= sample_batch_size. Sample
                # batches will be concatenated together to this size for
                # training.
                "train_batch_size": 256,
                # parallelism
                # Number of workers for collecting samples with. This only
                # makes sense to increase if your environment is particularly
                # slow to sample, or if you're using the Async or Ape-X
                # optimizers.
                "num_workers": num_workers,
                # distribute epsilon over workers
                "per_worker_exploration": True,
                # compute worker-side prioritization
                # (DQN: False, DDQN: True, APEX: True!!)
                "worker_side_prioritization": True,
            })

    # write the policy graph to tensorboard (for debugging purposes)
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from checkpoint, if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN --env CartPole-v0 --steps 1000000 --out rollouts.pkl Example Usage via executable: ./rollout.py /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN --env CartPole-v0 --steps 1000000 --out rollouts.pkl """ ENV = "PathPlanningEnv" if ENV == "CarlaRoadEnv": def env_creator(env_config): env = CarlaRoadEnv(env_config) return env register_env("CarlaRoadEnv-v0", env_creator) ModelCatalog.register_custom_model("carla_road_model", FactoredModel) else: def env_creator(env_config): env = PathPlanningEnv(env_config) return env register_env("PathPlanningEnv-v0", env_creator) ModelCatalog.register_custom_model("path_planning_model", PathPlanningModel) # Note: if you use any custom models or envs, register them here first, e.g.: # # ModelCatalog.register_custom_model("pa_model", ParametricActionsModel) # register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
def visualizer_rllib(args):
    """Visualizer for RLlib experiments.

    This function takes args (see function create_parser below for more
    detailed information on what information can be fed to this visualizer),
    and renders the experiment associated with it.
    """
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_config(result_dir)
    # TODO(ev) backwards compatibility hack
    try:
        pkl = get_rllib_pkl(result_dir)
    except Exception:
        pass

    # check if we have a multiagent scenario but in a
    # backwards compatible way
    if config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # hack for old pkl files
    # TODO(ev) remove eventually
    sim_params = flow_params['sim']
    setattr(sim_params, 'num_clients', 1)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    sim_params.restart_instance = True
    dir_path = os.path.dirname(os.path.realpath(__file__))
    emission_path = '{0}/test_time_rollout/'.format(dir_path)
    sim_params.emission_path = emission_path if args.gen_emission else None

    # pick your rendering mode
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = True
        print('NOTE: With render mode {}, an extra instance of the SUMO GUI '
              'will display before the GUI for visualizing the result. Click '
              'the green Play arrow to continue.'.format(args.render_mode))
    elif args.render_mode == 'no_render':
        sim_params.render = False
    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(params=flow_params, version=0)
    register_env(env_name, create_env)

    # check if the environment is a single or multiagent environment, and
    # get the right address accordingly
    # single_agent_envs = [env for env in dir(flow.envs)
    #                      if not env.startswith('__')]

    # if flow_params['env_name'] in single_agent_envs:
    #     env_loc = 'flow.envs'
    # else:
    #     env_loc = 'flow.multiagent_envs'

    # Start the environment with the gui turned on and a path for the
    # emission file
    env_params = flow_params['env']
    env_params.restart_instance = False
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    if hasattr(agent, "local_evaluator") and \
            os.environ.get("TEST_FLAG") != 'True':
        env = agent.local_evaluator.env
    else:
        env = gym.make(env_name)

    if multiagent:
        rets = {}
        # map the agent id to its policy
        policy_map_fn = config['multiagent']['policy_mapping_fn'].func
        for key in config['multiagent']['policy_graphs'].keys():
            rets[key] = []
    else:
        rets = []

    if config['model']['use_lstm']:
        use_lstm = True
        if multiagent:
            state_init = {}
            # map the agent id to its policy
            policy_map_fn = config['multiagent']['policy_mapping_fn'].func
            size = config['model']['lstm_cell_size']
            for key in config['multiagent']['policy_graphs'].keys():
                state_init[key] = [
                    np.zeros(size, np.float32),
                    np.zeros(size, np.float32)
                ]
        else:
            state_init = [
                np.zeros(config['model']['lstm_cell_size'], np.float32),
                np.zeros(config['model']['lstm_cell_size'], np.float32)
            ]
    else:
        use_lstm = False

    env.restart_simulation(sim_params=sim_params, render=sim_params.render)

    # Simulate and collect metrics
    final_outflows = []
    final_inflows = []
    mean_speed = []
    std_speed = []
    for i in range(args.num_rollouts):
        vel = []
        state = env.reset()
        if multiagent:
            ret = {key: [0] for key in rets.keys()}
        else:
            ret = 0
        for j in range(env_params.horizon):
            vehicles = env.unwrapped.k.vehicle
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            if multiagent:
                action = {}
                for agent_id in state.keys():
                    if use_lstm:
                        action[agent_id], state_init[agent_id], logits = \
                            agent.compute_action(
                                state[agent_id],
                                state=state_init[agent_id],
                                policy_id=policy_map_fn(agent_id))
                    else:
                        action[agent_id] = agent.compute_action(
                            state[agent_id],
                            policy_id=policy_map_fn(agent_id))
                    if j == 0:
                        action[agent_id] = 0.2  # to prevent accident
                        # print("hello, ", j, action[agent_id])
            else:
                action = agent.compute_action(state)
            state, reward, done, _ = env.step(action)
            if multiagent:
                for actor, rew in reward.items():
                    ret[policy_map_fn(actor)][0] += rew
            else:
                ret += reward
            if multiagent and done['__all__']:
                break
            if not multiagent and done:
                break

        if multiagent:
            for key in rets.keys():
                rets[key].append(ret[key])
        else:
            rets.append(ret)
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        inflow = vehicles.get_inflow_rate(500)
        final_inflows.append(inflow)
        if np.all(np.array(final_inflows) > 1e-5):
            throughput_efficiency = [
                x / y for x, y in zip(final_outflows, final_inflows)
            ]
        else:
            throughput_efficiency = [0] * len(final_inflows)
        mean_speed.append(np.mean(vel))
        std_speed.append(np.std(vel))
        if multiagent:
            for agent_id, rew in rets.items():
                print('Round {}, Return: {} for agent {}'.format(
                    i, ret, agent_id))
        else:
            print('Round {}, Return: {}'.format(i, ret))

    print('==== Summary of results ====')
    print("Return:")
    print(mean_speed)
    if multiagent:
        for agent_id, rew in rets.items():
            print('For agent', agent_id)
            print(rew)
            print('Average, std return: {}, {} for agent {}'.format(
                np.mean(rew), np.std(rew), agent_id))
    else:
        print(rets)
        print('Average, std: {}, {}'.format(np.mean(rets), np.std(rets)))

    print("\nSpeed, mean (m/s):")
    print(mean_speed)
    print('Average, std: {}, {}'.format(np.mean(mean_speed),
                                        np.std(mean_speed)))
    print("\nSpeed, std (m/s):")
    print(std_speed)
    print('Average, std: {}, {}'.format(np.mean(std_speed),
                                        np.std(std_speed)))

    # Compute arrival rate of vehicles in the last 500 sec of the run
    print("\nOutflows (veh/hr):")
    print(final_outflows)
    print('Average, std: {}, {}'.format(np.mean(final_outflows),
                                        np.std(final_outflows)))
    # Compute departure rate of vehicles in the last 500 sec of the run
    print("Inflows (veh/hr):")
    print(final_inflows)
    print('Average, std: {}, {}'.format(np.mean(final_inflows),
                                        np.std(final_inflows)))
    # Compute throughput efficiency in the last 500 sec of the run
    print("Throughput efficiency (veh/hr):")
    print(throughput_efficiency)
    print('Average, std: {}, {}'.format(np.mean(throughput_efficiency),
                                        np.std(throughput_efficiency)))

    # terminate the environment
    env.unwrapped.terminate()

    # if prompted, convert the emission file into a csv file
    if args.gen_emission:
        time.sleep(0.1)

        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(env.scenario.name)
        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)

        # convert the emission file into a csv file
        emission_to_csv(emission_path)

        # delete the .xml version of the emission file
        os.remove(emission_path)

    # if we wanted to save the render, here we create the movie
    if args.save_render:
        dirs = os.listdir(os.path.expanduser('~') + '/flow_rendering')
        # Ignore hidden files
        dirs = [d for d in dirs if d[0] != '.']
        dirs.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d-%H%M%S"))
        recent_dir = dirs[-1]
        # create the movie
        movie_dir = os.path.expanduser('~') + '/flow_rendering/' + recent_dir
        save_dir = os.path.expanduser('~') + '/flow_movies'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        os_cmd = "cd " + movie_dir + " && ffmpeg -i frame_%06d.png"
        os_cmd += " -pix_fmt yuv420p " + dirs[-1] + ".mp4"
        os_cmd += "&& cp " + dirs[-1] + ".mp4 " + save_dir + "/"
        os.system(os_cmd)
self.obs_in = input_dict["obs"] self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space, self.action_space, num_outputs, options) return self.fcnet.outputs, self.fcnet.last_layer if __name__ == "__main__": board_sizes = (3, 3) diff = 2 st = 100000000 env_name = 'puzzle-v0' #my_board = gym.make('gym_puzzle:puzzle-v0') register_env(env_name, lambda config: PuzzleEnv(config)) ray.init() # ModelCatalog.register_custom_model("my_model", CustomModel) tune.run( "PPO", stop={ #"timesteps_total": 10000, #"episode_len_mean": 20.0, "training_iteration": 50, }, config={ "env": "puzzle-v0", # or "puzzle-v0" if registered above # "model": # "custom_model": "my_model", # },
    # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
    action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

    # Mask out invalid actions (use tf.float32.min for stability)
    inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
    masked_logits = inf_mask + action_logits
    return masked_logits, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "PPO":
        cfg = {
            "observation_filter": "NoFilter",  # don't filter the action list
            "vf_share_layers": True,  # don't create duplicate value model
        }
    elif args.run == "DQN":
        cfg = {
            "hiddens": [],  # important: don't postprocess the action scores
        }
    else:
        cfg = {}  # PG, IMPALA, A2C, etc.
    run_experiments({
        "parametric_cartpole": {
            "run": args.run,
            "env": "pa_cartpole",
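# For illustration only (not from the original script): a NumPy stand-in
# showing why the mask above uses float32.min rather than -inf. A zero mask
# entry becomes a very large negative logit, so softmax assigns it ~0
# probability without introducing infinities that would produce NaNs in the
# gradients. Assumes only numpy.
import numpy as np

with np.errstate(divide="ignore"):  # log(0) -> -inf, then clipped to min
    action_mask = np.array([1.0, 0.0, 1.0], dtype=np.float32)
    logits = np.array([0.5, 2.0, -0.3], dtype=np.float32)
    inf_mask = np.maximum(np.log(action_mask), np.finfo(np.float32).min)

masked = logits + inf_mask
probs = np.exp(masked - masked.max())
probs /= probs.sum()  # probs[1] is effectively zero; the others renormalize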
def register_env_creator(self):
    register_env("RoboschoolReacher-v1", create_environment)
# Box(low=-1, high=1000, shape=(31,), dtype=np.float) Box(np.array(lower_bounds), np.array(upper_bounds))) def run(self): print("Starting policy server at {}:{}".format(SERVER_ADDRESS, SERVER_PORT)) server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT) server.serve_forever() if __name__ == "__main__": # ray.init(redis_max_memory=10000000000, object_store_memory=3000000000, memory=2000000000) ray.init() register_env("srv", lambda _: MarketServing()) # We use DQN since it supports off-policy actions, but you can choose and # configure any agent. # dqn = PGTrainer( # env="srv", # config={ # # Use a single process to avoid needing to set up a load balancer # # "num_workers": 0, # "evaluation_num_episodes": 1, # # "sample_batch_size": 40, # # "train_batch_size": 40, # # "horizon": 40, # "sample_batch_size": 15, # "train_batch_size": 128, # })
class CartpoleServing(ExternalEnv):
    def __init__(self):
        ExternalEnv.__init__(
            self,
            spaces.Discrete(2),
            spaces.Box(low=-10, high=10, shape=(4, ), dtype=np.float32))

    def run(self):
        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":
    ray.init()
    register_env("srv", lambda _: CartpoleServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="srv",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
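# For illustration only (not from the original server script): a minimal
# sketch of the client side that would talk to the PolicyServer above. It
# assumes RLlib's PolicyClient from the same Ray version, a local CartPole
# simulator, and that SERVER_ADDRESS/SERVER_PORT match the server process.
import gym
from ray.rllib.utils.policy_client import PolicyClient

env = gym.make("CartPole-v0")
client = PolicyClient("http://{}:{}".format(SERVER_ADDRESS, SERVER_PORT))

obs = env.reset()
eid = client.start_episode(training_enabled=True)
done = False
while not done:
    action = client.get_action(eid, obs)  # query the served policy
    obs, reward, done, _ = env.step(action)
    client.log_returns(eid, reward)  # report rewards back for training
client.end_episode(eid, obs)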
    def reset(self):
        self.cur_pos = 0
        return [self.cur_pos]

    def step(self, action):
        assert action in [0, 1], action
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        return [self.cur_pos], 1 if done else 0, done, {}


if __name__ == "__main__":
    print("CORRIDOR TEST")
    env_creator_name = "corridor"
    register_env(env_creator_name, lambda config: SimpleCorridor(config))
    ray.init()
    run_experiments({
        "demo": {
            "run": "PPO",
            "env": "corridor",
            "config": {
                "env_config": {
                    "corridor_length": 5,
                },
            },
        },
    })
def register_env_creator(self):
    register_env("RoboschoolHumanoid-v1", create_environment)
        rewards.append(reward)
    return {
        "episode_reward_mean": np.mean(rewards),
        "timesteps_this_iter": steps,
    }


import ray
from ray import tune
from ray.rllib.utils.seed import seed as rllib_seed
import rl_toy
from rl_toy.envs import RLToyEnv
from ray.tune.registry import register_env

register_env("RLToy-v0", lambda config: RLToyEnv(config))

from ray.rllib.models.preprocessors import OneHotPreprocessor
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_preprocessor("ohe", OneHotPreprocessor)

# rllib_seed(0, 0, 0)  # IMPORTANT: doesn't work, I think due to
#                      # multi-processing; use config["seed"] instead
# np.random.seed(0)
# import random
# random.seed(0)
# import tensorflow as tf
# tf.set_random_seed(0)
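# For illustration only (not from the original script): per the note above,
# reproducibility is handled through the trainer config rather than global
# seeding. A minimal sketch of passing the seed via Tune, assuming the
# "RLToy-v0" registration above and an RLlib version that supports the
# "seed" config key:
tune.run(
    "PPO",
    stop={"training_iteration": 1},
    config={
        "env": "RLToy-v0",
        "seed": 0,  # seeds each RLlib worker for per-worker reproducibility
        "num_workers": 1,
    },
)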
from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph from ray.rllib.agents.ppo.ppo import PPOAgent from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.logger import pretty_print from ray.tune.registry import register_env parser = argparse.ArgumentParser() parser.add_argument("--num-iters", type=int, default=20) if __name__ == "__main__": args = parser.parse_args() ray.init() # Simple environment with 4 independent cartpole entities register_env("multi_cartpole", lambda _: MultiCartpole(4)) single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space # You can also have multiple policy graphs per trainer, but here we just # show one each for PPO and DQN. policy_graphs = { "ppo_policy": (PPOPolicyGraph, obs_space, act_space, {}), "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}), } def policy_mapping_fn(agent_id): if agent_id % 2 == 0: return "ppo_policy" else:
def setup_exps_rllib(flow_params, n_cpus, n_rollouts, policy_graphs=None, policy_mapping_fn=None, policies_to_train=None): """Return the relevant components of an RLlib experiment. Parameters ---------- flow_params : dict flow-specific parameters (see flow/utils/registry.py) n_cpus : int number of CPUs to run the experiment over n_rollouts : int number of rollouts per training iteration policy_graphs : dict, optional TODO policy_mapping_fn : function, optional TODO policies_to_train : list of str, optional TODO Returns ------- str name of the training algorithm str name of the gym environment to be trained dict training configuration parameters """ from ray import tune from ray.tune.registry import register_env try: from ray.rllib.agents.agent import get_agent_class except ImportError: from ray.rllib.agents.registry import get_agent_class horizon = flow_params['env'].horizon alg_run = "PPO" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = n_cpus config["train_batch_size"] = horizon * n_rollouts config["gamma"] = 0.999 # discount rate config["model"].update({"fcnet_hiddens": [32, 32, 32]}) config["use_gae"] = True config["lambda"] = 0.97 config["kl_target"] = 0.02 config["num_sgd_iter"] = 10 config["horizon"] = horizon # save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run # multiagent configuration if policy_graphs is not None: print("policy_graphs", policy_graphs) config['multiagent'].update({'policies': policy_graphs}) if policy_mapping_fn is not None: config['multiagent'].update( {'policy_mapping_fn': tune.function(policy_mapping_fn)}) if policies_to_train is not None: config['multiagent'].update({'policies_to_train': policies_to_train}) create_env, gym_name = make_create_env(params=flow_params) # Register as rllib env register_env(gym_name, create_env) return alg_run, gym_name, config
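# For illustration only (not from the original module): a sketch of how the
# triple returned by setup_exps_rllib is typically consumed, mirroring the
# experiment pattern used elsewhere in this collection. flow_params and the
# stop/checkpoint settings are placeholders.
from ray import tune

alg_run, gym_name, config = setup_exps_rllib(
    flow_params, n_cpus=1, n_rollouts=1)
config["env"] = gym_name
tune.run_experiments({
    "example_exp": {
        "run": alg_run,
        "config": {**config},
        "checkpoint_freq": 5,
        "stop": {"training_iteration": 10},
    },
})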
def testNoStepOnInit(self):
    register_env("fail", lambda _: FailOnStepEnv())
    pg = PGAgent(env="fail", config={"num_workers": 1})
    self.assertRaises(Exception, lambda: pg.train())
def __init__(self): super(FeedingPandaEnv, self).__init__(robot=Panda(robot_arm), human=Human(human_controllable_joint_indices, controllable=False)) class FeedingPR2HumanEnv(FeedingEnv, MultiAgentEnv): def __init__(self): super(FeedingPR2HumanEnv, self).__init__(robot=PR2(robot_arm), human=Human(human_controllable_joint_indices, controllable=True)) register_env('assistive_gym:FeedingPR2Human-v1', lambda config: FeedingPR2HumanEnv()) class FeedingBaxterHumanEnv(FeedingEnv, MultiAgentEnv): def __init__(self): super(FeedingBaxterHumanEnv, self).__init__(robot=Baxter(robot_arm), human=Human(human_controllable_joint_indices, controllable=True)) register_env('assistive_gym:FeedingBaxterHuman-v1', lambda config: FeedingBaxterHumanEnv()) class FeedingSawyerHumanEnv(FeedingEnv, MultiAgentEnv):
"""Integration test: (1) pendulum works, (2) single-agent multi-agent works.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import ray from ray.rllib.test.test_multi_agent_env import make_multiagent from ray.tune import run_experiments from ray.tune.registry import register_env if __name__ == "__main__": ray.init() MultiPendulum = make_multiagent("Pendulum-v0") register_env("multi_pend", lambda _: MultiPendulum(1)) trials = run_experiments({ "test": { "run": "PPO", "env": "multi_pend", "stop": { "timesteps_total": 500000, "episode_reward_mean": -200, }, "config": { "train_batch_size": 2048, "vf_clip_param": 10.0, "num_workers": 0, "num_envs_per_worker": 10, "lambda": 0.1, "gamma": 0.95, "lr": 0.0003,
def setup_exps_rllib(flow_params, n_cpus, n_rollouts, policy_graphs=None, policy_mapping_fn=None, policies_to_train=None, flags=None): from ray import tune from ray.tune.registry import register_env try: from ray.rllib.agents.agent import get_agent_class except ImportError: from ray.rllib.agents.registry import get_agent_class import torch horizon = flow_params['env'].horizon from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG alg_run = "DDPG" agent_cls = get_agent_class(alg_run) config = deepcopy(agent_cls._default_config) config["num_workers"] = 1 # model config['n_step'] = 1 config['actor_hiddens'] = [64, 64] config['actor_lr'] = 0.0001 # in article 'ddpg' config['critic_lr'] = 0.0001 config['critic_hiddens'] = [64, 64] config['gamma'] = 0.99 config['model']['fcnet_hiddens'] = [64, 64] config['lr'] = 1e-5 # exploration config['exploration_config']['final_scale'] = 0.05 config['exploration_config']['scale_timesteps'] = 1500000 config['exploration_config']['ou_base_scale'] = 0.1 config['exploration_config']['ou_theta'] = 0.15 config['exploration_config']['ou_sigma'] = 0.2 # optimization config['tau'] = 0.001 config['l2_reg'] = 1e-6 config['train_batch_size'] = 64 config['learning_starts'] = 3000 # evaluation #config['evaluation_interval'] = 5 config['buffer_size'] = 300000 #3e5 config['timesteps_per_iteration'] = 3000 config['prioritized_replay'] = False #common config config['framework'] = 'torch' config['callbacks'] = { "on_episode_end": None, "on_episode_start": None, "on_episode_step": None, "on_postprocess_traj": None, "on_sample_end": None, "on_train_result": None } # config["opt_type"]= "adam" for impala and APPO, default is SGD # TrainOneStep class call SGD -->execution_plan function can have policy update function print("cuda is available: ", torch.cuda.is_available()) print('Beginning training.') print("==========================================") print("running algorithm: ", alg_run) # "Framework: ", "torch" # save the flow params for replay flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4) config['env_config']['flow_params'] = flow_json config['env_config']['run'] = alg_run # multiagent configuration if policy_graphs is not None: print("policy_graphs", policy_graphs) config['multiagent'].update({'policies': policy_graphs}) if policy_mapping_fn is not None: config['multiagent'].update( {'policy_mapping_fn': tune.function(policy_mapping_fn)}) if policies_to_train is not None: config['multiagent'].update({'policies_to_train': policies_to_train}) create_env, gym_name = make_create_env(params=flow_params) # Register as rllib env register_env(gym_name, create_env) return alg_run, gym_name, config
def register_env_creator(self):
    register_env("NetworkCompression-v1", create_environment)
obs[i], rew[i], done[i], info[i] = [ featurize(step_obs[0][i]), step_obs[1][i], step_obs[1][i] == -1 or step_obs[2], step_obs[3], ] done["__all__"] = step_obs[2] return obs, rew, done, info def reset(self): obs = self.env.reset() return {i: featurize(obs[i]) for i in self.agents_index} register_env("pommer_team", lambda _: MultiAgent()) sys.setrecursionlimit(1000) class PhasePPO(PPOAgent): def __init__(self, config=None, env=None, logger_creator=None): super(PhasePPO, self).__init__(config=config, env=env, logger_creator=logger_creator) self.train_phase = 0 def on_episode_end(info): env = info["env"] episode.custom_metrics["train_phase"] = env.get_phase()
def train(args,parser = None): """Train agent :args: Argparse.args: User-defined arguments """ # Set logging level logging.basicConfig(level= args.log_level, format='%(message)s') # Initialize mail bot if args.email_updates: args.mail_bot = CameleonEmailBot(email_sender = args.email_sender, email_receiver = args.email_receiver, email_server = args.email_server) # Initialize Ray - and try to prevent OOM #Spin up Ray only if it is not already running if args.init_ray: ray.init(object_store_memory = args.ray_obj_store_mem) # Set up environment env = gym.make(args.env_name) # Wrap environment env = wrap_env(env, args.wrappers) # Register environment with Ray register_env(args.env_name, lambda config: env) # Set model and config model, config = str2model(args.model_name, config = True) #Add to config for compute resources config['num_workers'] = args.num_workers config['num_gpus'] = args.num_gpus config['framework'] = args.framework config['seed'] = args.seed _determine_stopping_criteria(args) #Update config if one was passed if args.config: config = update_config(config, args.config) # Update outdir args.outdir_root = args.outdir args.outdir = "{}{}_{}_{}_rs{}_w{}_{}".format(args.outdir, args.model_name, args.framework, args.env_name, args.seed, args.num_workers, dt.datetime.now().strftime("%Y.%m.%d")) args.tune_dirname = "{}_{}_rs{}_w{}_{}".format( args.model_name, args.framework, args.seed, args.num_workers, dt.datetime.now().strftime("%Y.%m.%d")) # Set up agent agent = model(env = args.env_name, config = config, logger_creator=cameleon_logger_creator( args.outdir)) # Change to pretrained model if needed if args.checkpoint_path: agent.restore(args.checkpoint_path) if args.tune: agent = args.model_name # Train the agent train_agent(agent, args, config, tune = args.tune) # Shutdown Ray (ensures fresh start for random seeds) ray.shutdown() # Send email update, if necessary if args.email_updates and not args.failure_message: args.mail_bot.send_email("train_finished", args)
def main(args): ray.init(redis_max_memory=int(ray.utils.get_system_memory() * 0.4), memory=int(ray.utils.get_system_memory() * 0.2), object_store_memory=int(ray.utils.get_system_memory() * 0.2), num_gpus=args.num_gpus, num_cpus=6, temp_dir=args.temp_dir) discrete_action_input = False if args.trainer == 'dqn': trainer = DQNTrainer discrete_action_input = True else: raise Exception('Unknown trainer: "{}"'.format(args.trainer)) def env_creater(mpe_args): return MultiAgentParticleEnv(**mpe_args) register_env("mpe", env_creater) env = env_creater({ "scenario_name": args.scenario, "discrete_action_input": discrete_action_input }) def gen_policy(i): return (None, env.observation_space_dict[i], env.action_space_dict[i], { "agent_id": i, "use_local_critic": False, "obs_space_dict": env.observation_space_dict, "act_space_dict": env.action_space_dict, }) policies = { "policy_%d" % i: gen_policy(i) for i in range(len(env.observation_space_dict)) } policy_ids = list(policies.keys()) def policy_mapping_fn(agent_id): return policy_ids[agent_id] exp_name = "{}{}".format( args.scenario.replace("_", "").replace("-", ""), "_{}".format(args.add_postfix) if args.add_postfix != "" else "") run_experiments( { exp_name: { "run": trainer, "env": "mpe", "stop": { "episodes_total": args.num_episodes, }, "checkpoint_freq": args.checkpoint_freq, "local_dir": args.local_dir, "restore": args.restore, "config": { # === Log === "log_level": "ERROR", # === Environment === "env_config": { "scenario_name": args.scenario, "discrete_action_input": discrete_action_input }, "num_envs_per_worker": args.num_envs_per_worker, "horizon": args.max_episode_len, # === Policy Config === # --- Model --- # "good_policy": args.good_policy, # "adv_policy": args.adv_policy, # "actor_hiddens": [args.num_units] * 2, # "actor_hidden_activation": "relu", # "critic_hiddens": [args.num_units] * 2, # "critic_hidden_activation": "relu", "n_step": args.n_step, "gamma": args.gamma, # --- Exploration --- # "tau": 0.01, # --- Replay buffer --- "buffer_size": args.replay_buffer, # int(10000), # int(1e6) # --- Optimization --- # "actor_lr": args.lr, # "critic_lr": args.lr, "learning_starts": args.train_batch_size * args.max_episode_len, "sample_batch_size": args.sample_batch_size, "train_batch_size": args.train_batch_size, "batch_mode": "truncate_episodes", # --- Parallelism --- "num_workers": args.num_workers, "num_gpus": args.num_gpus, "num_gpus_per_worker": 0, # === Multi-agent setting === "multiagent": { "policies": policies, "policy_mapping_fn": ray.tune.function(policy_mapping_fn) }, }, }, }, verbose=0, reuse_actors=False) # reuse_actors=True - messes up the results
def register_env_creator(self):
    register_env(
        # This should be different from procgen_env_wrapper
        "stacked_procgen_env",
        lambda config: gym.wrappers.FrameStack(ProcgenEnvWrapper(config), 4))
# vehicles to be placed in the network at the start of a rollout (see # flow.core.params.VehicleParams) veh=vehicles, # parameters specifying the positioning of vehicles upon initialization/ # reset (see flow.core.params.InitialConfig) initial=InitialConfig(), ) # SET UP RLLIB MULTI-AGENT FEATURES create_env, env_name = make_create_env(params=flow_params, version=0) # register as rllib env register_env(env_name, create_env) # multiagent configuration test_env = create_env() obs_space = test_env.observation_space act_space = test_env.action_space POLICY_GRAPHS = {'av': (PPOTFPolicy, obs_space, act_space, {})} POLICIES_TO_TRAIN = ['av'] def policy_mapping_fn(_): """Map a policy in RLlib.""" return 'av'
import os import ray from ray.rllib.agents.dqn import DQNAgent from ray.rllib.models import ModelCatalog from ray.tune.logger import pretty_print from ray.tune.registry import register_env from algos.gym_halite import env_creator from algos.model import ParametricActionsModel CHECKPOINT_FILE = "last_checkpoint.out" ray.init(local_mode=True) ModelCatalog.register_custom_model("parametric", ParametricActionsModel) register_env("halite_env", env_creator) dqn = DQNAgent( env="halite_env", config={ "env_config": {}, "num_workers": 1, "num_cpus_per_worker": 1, "num_envs_per_worker": 1, "num_gpus": 1, "hiddens": [], "schedule_max_timesteps": 100000000, "timesteps_per_iteration": 1000, "exploration_fraction": 0.8, "exploration_final_eps": 0.02, "lr": 1e-3, "model": {
else: reward = -1 done = len(self.history) > 100 return self._next_obs(), reward, done, {} def _next_obs(self): token = random.choice([0, 1]) self.history.append(token) return token if __name__ == "__main__": ray.init() args = parser.parse_args() ModelCatalog.register_custom_model("rnn", MyKerasRNN) register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c)) register_env("RepeatInitialEnv", lambda _: RepeatInitialEnv()) tune.run(args.run, stop={"episode_reward_mean": args.stop}, config={ "env": args.env, "env_config": { "repeat_delay": 2, }, "gamma": 0.9, "num_workers": 0, "num_envs_per_worker": 20, "entropy_coeff": 0.001, "num_sgd_iter": 5, "vf_loss_coeff": 1e-5, "model": {