    def doTestNestedTuple(self, make_env):
        ModelCatalog.register_custom_model("composite2", TupleSpyModel)
        register_env("nested2", make_env)
        pg = PGAgent(
            env="nested2",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite2",
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
    def testPyTorchModel(self):
        ModelCatalog.register_custom_model("composite", TorchSpyModel)
        register_env("nested", lambda _: NestedDictEnv())
        a2c = A2CAgent(
            env="nested",
            config={
                "num_workers": 0,
                "use_pytorch": True,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                },
            })
        a2c.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "torch_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
    def doTestNestedDict(self, make_env, test_lstm=False):
        ModelCatalog.register_custom_model("composite", DictSpyModel)
        register_env("nested", make_env)
        pg = PGAgent(
            env="nested",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                    "use_lstm": test_lstm,
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
    def testInvalidModel(self):
        ModelCatalog.register_custom_model("invalid", InvalidModel)
        self.assertRaises(ValueError, lambda: PGAgent(
            env="CartPole-v0",
            config={
                "model": {
                    "custom_model": "invalid",
                },
            }))
    def testCustomModel(self):
        ray.init()
        ModelCatalog.register_custom_model("foo", CustomModel)
        p1 = ModelCatalog.get_model({
            "obs": tf.constant([1, 2, 3])
        }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5,
            {"custom_model": "foo"})
        self.assertEqual(str(type(p1)), str(CustomModel))
    def testMinibatchSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
    def testInvalidModel2(self):
        ModelCatalog.register_custom_model("invalid2", InvalidModel2)
        self.assertRaisesRegexp(
            ValueError, "Expected output.*",
            lambda: PGAgent(
                env="CartPole-v0",
                config={
                    "model": {
                        "custom_model": "invalid2",
                    },
                }))
    def testSimpleOptimizerSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": True,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(batch1["sequences"].tolist(), [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
    def testMultiAgentComplexSpaces(self):
        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
        act_space = spaces.Discrete(2)
        pg = PGAgent(
            env="nested_ma",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "multiagent": {
                    "policy_graphs": {
                        "tuple_policy": (
                            PGPolicyGraph, TUPLE_SPACE, act_space,
                            {"model": {"custom_model": "tuple_spy"}}),
                        "dict_policy": (
                            PGPolicyGraph, DICT_SPACE, act_space,
                            {"model": {"custom_model": "dict_spy"}}),
                    },
                    "policy_mapping_fn": lambda a: {
                        "tuple_agent": "tuple_policy",
                        "dict_agent": "dict_policy"}[a],
                },
            })
        pg.train()

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
def get_config(args: Args):
    # num_rollouts = 2
    ModelCatalog.register_custom_model("SoftModularActorCriticNet",
                                       SoftModularActorCriticNet)
    ModelCatalog.register_custom_model("SimpleEnsembleActorCriticNet",
                                       SimpleEnsembleActorCriticNet)

    # 1. Get the default training configuration and specify the POMgame to load.
    config = deepcopy(get_agent_class(args.alg_name)._default_config)

    # 2. Set environment config. This will be passed to
    # the env_creator function via the register env lambda below.
    # local_ratio specifies the ratio between the global reward and the local reward.
    # config["env_config"] = {"local_ratio": 0.5}

    def env_creator():
        if args.game.__package__.endswith('atari'):
            if (args.game_name.startswith('foozpong')
                    or args.game_name.startswith('basketball_pong')
                    or args.game_name.startswith('volleyball_pong')):
                env = args.game.env(obs_type=args.atari_obs_type,
                                    max_cycles=args.max_steps['atari'],
                                    full_action_space=False,
                                    num_players=2)
            else:
                env = args.game.env(obs_type=args.atari_obs_type,
                                    full_action_space=False,
                                    max_cycles=args.max_steps['atari'])
            env = frame_skip_v0(env, args.atari_frame_skip_num)
            env = frame_stack_v1(env, args.atari_frame_stack_num)
        else:
            env = args.game.env()
        if args.game_name.startswith('rps'):
            env = one_hot_obs_wrapper(env)
        env = dtype_v0(env, dtype=float32)
        env = pad_observations_v0(env)
        env = pad_action_space_v0(env)
        if args.game_name.startswith('connect_four') or args.game_name.startswith('tictactoe'):
            env = FlattenEnvWrapper(env)
        GAUSSIAN_STD = 1.0
        assert abs(GAUSSIAN_STD - 1.0) < 1e-5, \
            "must be 1.0, otherwise simple ensemble implementation is wrong"
        env = LatentGaussianAugmentedEnvWrapper(
            env,
            latent_parameter_dim=args.latent_para_dim,
            gaussian_std=1.0,
            use_dict_obs_space=args.use_dict_obs_space)
        return env

    # 3. Register env, and get trainer_class
    register_env(args.game_name, lambda config: PettingZooEnv(env_creator()))
    trainer_class = get_agent_class(args.alg_name)

    # 4. Extract space dimensions
    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    agents_id = test_env.agents
    print(f"obs_space: {obs_space}; act_space: {act_space}")

    # 5. Configuration for multiagent setup:
    config["framework"] = "torch"
    config["num_gpus"] = 0
    config["log_level"] = "INFO"
    config["num_workers"] = args.num_cpus // 2
    config["num_cpus_per_worker"] = 1
    config['num_envs_per_worker'] = 5
    # Fragment length, collected at once from each worker and for each agent!
    config["rollout_fragment_length"] = 100
    # Training batch size -> Fragments are concatenated up to this point.
    config["train_batch_size"] = 2000
    config["sgd_minibatch_size"] = 256
    config["entropy_coeff"] = 0.01
    config["lambda"] = 0.9
    config["vf_clip_param"] = 50
    config["num_sgd_iter"] = 10
    # After n steps, force reset the simulation.
    config["horizon"] = args.max_steps[args.game_type]
    # Default: False.
    config["no_done_at_end"] = False
    # Info: If False, each agent's trajectory is expected to have at most one
    # done=True, in the last step of the trajectory. If no_done_at_end=True,
    # the environment is not reset when dones["__all__"] is True.
    config['ignore_worker_failures'] = True

    def get_main_and_test_config(
            config: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        main_policies = {}
        for i, agent_id in enumerate(agents_id):
            for j in range(1):
                main_policies[f'{agent_id}_{j}'] = (
                    PPOTorchPolicy, obs_space, act_space,
                    {"framework": "torch"})
        test_policies = {
            'test_' + agent_id: (PPOTorchPolicy, obs_space, act_space,
                                 {"framework": "torch"})
            for agent_id in agents_id if is_adversary(agent_id)
        }
        policies = {**main_policies, **test_policies}

        main_config, test_config = deepcopy(config), deepcopy(config)
        main_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: f'{agent_id}_{0}',
            "policies_to_train": list(main_policies.keys())
        }

        def test_config_policy_mapping(agent_id: str) -> str:
            if is_adversary(agent_id):
                return 'test_' + agent_id
            return f'{agent_id}_{0}'

        test_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": test_config_policy_mapping,
            "policies_to_train": list(test_policies.keys())
        }
        return main_config, test_config

    def get_simple_ensemble_training_config(
            config: Dict[str, Any],
            ensemble_size: int = 3) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        if ensemble_size > 1:
            config["model"] = {
                "custom_model": "SimpleEnsembleActorCriticNet",
                "custom_model_config": {
                    "use_dict_obs_space": args.use_dict_obs_space,
                    'ensemble_size': ensemble_size
                }
            }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    def get_implicit_ensemble_training_config(
            config: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        config["model"] = {
            "custom_model": "SoftModularActorCriticNet",
            "custom_model_config": {
                "use_latent_embedding": args.use_latent_embedding,
                "use_dict_obs_space": args.use_dict_obs_space,
                "base_type": MLPBase,
                "em_input_shape": args.latent_para_dim,
                "emb_shaping_net_hidden_shapes": args.emb_shaping_net_hidden_shapes,
                'emb_shaping_net_last_softmax': args.emb_shaping_net_last_softmax,
                'em_hidden_shapes': [args.soft_modular_net_hidden_dim,
                                     args.soft_modular_net_hidden_dim],  # [400]
                'hidden_shapes': [args.soft_modular_net_hidden_dim,
                                  args.soft_modular_net_hidden_dim],  # [400, 400]
                'num_layers': args.soft_modular_net_num_layers,  # 4
                'num_modules': args.soft_modular_net_num_modules,  # 4
                'module_hidden': args.soft_modular_net_hidden_dim,  # 128
                'gating_hidden': args.soft_modular_net_hidden_dim,  # 256
                # With 1 gating layer, 500 steps work for simple_spread.
                'num_gating_layers': 2,
                'add_bn': False,
            }
        }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    if args.train_setting == 'single_policy':
        main_config, test_config = get_simple_ensemble_training_config(
            config, ensemble_size=1)
    elif args.train_setting == 'simple_ensemble':
        main_config, test_config = get_simple_ensemble_training_config(
            config, ensemble_size=3)
    else:
        assert args.train_setting == 'implicit_ensemble'
        main_config, test_config = get_implicit_ensemble_training_config(config)

    return trainer_class, test_env, main_config, test_config
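The function above returns everything a caller needs to start training; a hypothetical driver (an assumption, not shown in the original script) might consume the tuple like this:

# Hypothetical driver code (assumption, not part of the original script).
# `args` is the same parsed Args object used above; the env name passed to the
# trainer matches the register_env(args.game_name, ...) call inside get_config().
trainer_class, test_env, main_config, test_config = get_config(args)
trainer = trainer_class(config=main_config, env=args.game_name)
for _ in range(3):
    result = trainer.train()
    print(result["episode_reward_mean"])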
        self.base_model = tf.keras.Model(self.inputs, layer_out)

    # Implement the core forward method.
    def forward(self, input_dict, state, seq_lens):
        model_out = self.base_model(input_dict["obs"])
        return model_out, state

    def metrics(self):
        return {"foo": tf.constant(42.0)}


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model(
        "keras_model",
        MyVisionNetwork if args.use_vision_network else MyKerasModel)
    ModelCatalog.register_custom_model(
        "keras_q_model",
        MyVisionNetwork if args.use_vision_network else MyKerasQModel)

    # Tests https://github.com/ray-project/ray/issues/7293
    def check_has_custom_metric(result):
        r = result["result"]["info"]["learner"]
        if DEFAULT_POLICY_ID in r:
            r = r[DEFAULT_POLICY_ID].get(LEARNER_STATS_KEY,
                                         r[DEFAULT_POLICY_ID])
        assert r["model"]["foo"] == 42, result

    if args.run == "DQN":
        extra_config = {"learning_starts": 0}
ray.init(local_mode=args.debug)

if args.debug:
    tune_config = {
        'log_level': 'DEBUG',
        'num_workers': 1,
    }
else:
    tune_config = {
        'num_workers': 1,
        'num_gpus': 1,
    }

env_config = {'map_filename': args.map}
if args.use_cnn:
    env_cls = SquarePycroRts3MultiAgentEnv
    ModelCatalog.register_custom_model('masked_actions_model',
                                       MaskedActionsCNN)
    model_config = {
        'custom_model': 'masked_actions_model',
        'conv_filters': [[16, [2, 2], 1], [32, [2, 2], 1], [64, [3, 3], 2]],
        'conv_activation': 'leaky_relu',
        'fcnet_hiddens': [128, 128],
        'fcnet_activation': 'leaky_relu',
    }
else:
    env_cls = PycroRts3MultiAgentEnv
    ModelCatalog.register_custom_model('masked_actions_model',
                                       MaskedActionsMLP)
    model_config = {
        'custom_model': 'masked_actions_model',
        'fcnet_hiddens': [128, 128],
        'fcnet_activation': 'leaky_relu',
    }
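The snippet stops before the training call; a minimal sketch of how `env_cls`, `env_config`, `model_config`, and `tune_config` might be wired into `tune.run` (an assumption about the surrounding script, not part of it):

# Hedged sketch (assumption): the original script's continuation is not shown.
# Assumes `from ray import tune` and that env_cls accepts an env_config dict.
tune.run(
    'PPO',
    config={
        'env': env_cls,
        'env_config': env_config,
        'model': model_config,
        **tune_config,
    },
)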
        lstm_out = self.lstm(
            x, [torch.unsqueeze(state[0], 0),
                torch.unsqueeze(state[1], 0)])
        action_out = self.action_branch(lstm_out[0])
        self._cur_value = torch.reshape(self.value_branch(lstm_out[0]), [-1])
        return action_out, [
            torch.squeeze(lstm_out[1][0], 0),
            torch.squeeze(lstm_out[1][1], 0)
        ]


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model("rnn", RNNModel)
    tune.register_env("repeat_initial",
                      lambda _: RepeatInitialEnv(episode_len=100))
    tune.register_env("repeat_after_me",
                      lambda _: RepeatAfterMeEnv({"repeat_delay": 1}))
    tune.register_env("cartpole_stateless", lambda _: CartPoleStatelessEnv())

    config = {
        "env": args.env,
        "use_pytorch": True,
        "num_workers": 0,
        "num_envs_per_worker": 20,
        "gamma": 0.9,
        "entropy_coeff": 0.0001,
        "model": {
            "custom_model": "rnn",
if config["framework"] == "torch": return CCPPOTorchPolicy CCTrainer = PPOTrainer.with_updates( name="CCPPOTrainer", default_policy=CCPPOTFPolicy, get_policy_class=get_policy_class, ) if __name__ == "__main__": ray.init() args = parser.parse_args() ModelCatalog.register_custom_model( "cc_model", TorchCentralizedCriticModel if args.framework == "torch" else CentralizedCriticModel) config = { "env": TwoStepGame, "batch_mode": "complete_episodes", # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "num_workers": 0, "multiagent": { "policies": { "pol1": (None, Discrete(6), TwoStepGame.action_space, { "framework": args.framework, }), "pol2": (None, Discrete(6), TwoStepGame.action_space, { "framework": args.framework,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval-num", type=int, default=5)
    parser.add_argument("--eval-every", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=1)
    parser.add_argument("--cpus-per-worker", type=float, default=0.5)
    parser.add_argument("--cpus-for-driver", type=float, default=0.5)
    parser.add_argument("--address", type=str, default=None)
    parser.add_argument(
        "--model-path",
        type=str,
        default="/home/jippo/ray_results/YanivTrainer_2021-05-02_16-44-14/YanivTrainer_yaniv_3ee8a_00000_0_2021-05-02_16-44-14/models",
    )
    parser.add_argument("--opponent", type=str, default="intermediate")
    args = parser.parse_args()

    register_env("yaniv", lambda config: YanivEnv(config))
    ModelCatalog.register_custom_model("yaniv_mask", YanivActionMaskModel)

    if args.opponent == "intermediate":
        stepfn = intermediate_rule_step
    elif args.opponent == "novice":
        stepfn = novice_rule_step
    else:
        raise ValueError("opponent not defined: {}".format(args.opponent))

    env_config = {
        "end_after_n_deck_replacements": 0,
        "end_after_n_steps": 130,
        "early_end_reward": 0,
        "use_scaled_negative_reward": True,
        "use_scaled_positive_reward": True,
        "max_negative_reward": -1,
        "negative_score_cutoff": 30,
        "single_step": False,
        "step_reward": 0,
        "use_unkown_cards_in_state": False,
        "use_dead_cards_in_state": True,
        "observation_scheme": 1,
        "n_players": 2,
        "state_n_players": 2,
        "player_step_fn": {"player_1": stepfn},
    }
    env = YanivEnv(env_config)
    obs_space = env.observation_space
    act_space = env.action_space

    config = {
        "callbacks": YanivCallbacks,
        "num_gpus": 1,
        "env": "yaniv",
        "env_config": env_config,
        "framework": "torch",
        "multiagent": {
            "policies": {
                "policy_1": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["policy_1"],
        },
        "model": {
            "custom_model": "yaniv_mask",
            "fcnet_hiddens": [512, 512],
        },
        "num_envs_per_worker": 1,
        "num_cpus_per_worker": args.cpus_per_worker,
        "num_cpus_for_driver": args.cpus_for_driver,
        "num_workers": 1,
        "evaluation_num_workers": args.num_workers,
        "evaluation_num_episodes": args.eval_num,
        "evaluation_interval": 1,
    }

    ray.init(include_dashboard=False, address=args.address)
    trainer = A3CTrainer(env="yaniv", config=config)

    # models_path = "/home/jippo/ray_results/YanivTrainer_2021-05-02_16-44-14/YanivTrainer_yaniv_3ee8a_00000_0_2021-05-02_16-44-14/models"
    # models_path = "/scratch/student/models"
    models_path = args.model_path
    models = os.listdir(models_path)

    results = []
    for model in tqdm(sorted(models)):
        if not model.startswith("model"):
            print("idk", model)
            continue

        model_num = int(model[6:-4])
        if model_num % args.eval_every != 0:
            continue

        path = os.path.join(models_path, model)
        with open(path, "rb") as f:
            policy = pickle.load(f)
            trainer.get_policy("policy_1").set_state(policy)

        metrics = trainer._evaluate()
        metrics["evaluation"].pop("hist_stats")

        stats = {
            k: v
            for k, v in metrics["evaluation"]["custom_metrics"].items()
            if k.endswith("mean")
        }
        stats["model_number"] = model_num

        tqdm.write(
            "model: {: <6}: win_mean: {}, episodes: {}".format(
                model_num,
                stats["player_0_win_mean"],
                metrics["evaluation"]["episodes_this_iter"],
            )
        )
        results.append(stats)

    with open("{}_vs_models_{}.json".format(args.opponent, args.eval_num), "w") as f:
        json.dump(results, f, indent=4)
class CartpoleModel(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        self.model = Sequential()
        self.model.add(layers.InputLayer(input_tensor=input_dict["obs"],
                                         input_shape=(4,)))
        self.model.add(layers.Dense(4, name='l1', activation='relu'))
        self.model.add(layers.Dense(10, name='l2', activation='relu'))
        self.model.add(layers.Dense(10, name='l3', activation='relu'))
        self.model.add(layers.Dense(10, name='l4', activation='relu'))
        self.model.add(layers.Dense(2, name='l5', activation='relu'))
        return (self.model.get_layer("l5").output,
                self.model.get_layer("l4").output)


ray.init()
ModelCatalog.register_custom_model("CartpoleModel", CartpoleModel)

CartpoleEnv = gym.make('CartPole-v0')
CartpoleEnv = ScaleReward(CartpoleEnv)
register_env("CP", lambda _: CartpoleEnv)

trainer = a3c.A3CTrainer(
    env="CP",
    config={
        # "model": {"custom_model": "CartpoleModel"},
        # "observation_filter": "MeanStdFilter",
        # "vf_share_layers": True,
    },
    logger_creator=lambda _: ray.tune.logger.NoopLogger({}, None))

if os.path.isfile('weights.pickle'):
    weights = pickle.load(open("weights.pickle", "rb"))
    trainer.restore_from_object(weights)
        return self.value_module(model_out)

    # NOTE: customs
    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Return the embedded observation instead of action logits/scores.

        NOTE: only the embedded output is returned so that it fits the
        "compute_q_values" function signature from
        https://github.com/ray-project/ray/blob/master/rllib/agents/dqn/dqn_torch_policy.py
        """
        x, state = self.get_embeddings(input_dict, state, seq_lens)
        # logits = self.get_advantages_or_q_values(x)[0]
        return x, state

    def get_embeddings(self, input_dict, state, seq_lens, permute=True):
        """Encode observations."""
        x = input_dict["obs"].float()
        if permute:
            x = x.permute(0, 3, 1, 2)  # NHWC => NCHW
        x = self.encoder(x)
        return x, state


#######################################################################################################
#####################################           Misc            ######################################
#######################################################################################################
# Register model in ModelCatalog
ModelCatalog.register_custom_model("baseline_rainbow", BaselineRainbowTorchModel)
            initializer=normc_initializer(1.0),
            activation_fn=activation)

    def forward(self, input_dict: Dict[str, TensorType],
                state: List[TensorType],
                seq_lens: TensorType) -> (TensorType, List[TensorType]):
        obs = input_dict['obs']
        data, privates = obs['data'], obs['privates']
        b = privates.shape[0]
        N = data.shape[1]
        T = data.shape[2]

        # lstm
        # x1 = (td - torch.min(td)) / (torch.max(td) - torch.min(td))  # normalize
        lstm_in = data.permute(0, 2, 1, 3).contiguous().view(b, T, -1)
        lstm_out = self.lstm_net(lstm_in)

        # cnn
        x = torch.cat([privates, lstm_out], dim=1)
        self._features = self._hidden_layers(x)
        logits = self._policy_net.forward(self._features)
        return logits, state

    def value_function(self):
        assert self._features is not None
        return self._value_net.forward(self._features).squeeze(-1)


ModelCatalog.register_custom_model("mlstm_net", MLSTM_NET)
        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability).
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        masked_logits = inf_mask + action_logits

        return masked_logits, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()
    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "PPO":
        cfg = {
            "observation_filter": "NoFilter",  # don't filter the action list
            "vf_share_layers": True,  # don't create a duplicate value model
        }
    elif args.run == "DQN":
        cfg = {
            "hiddens": [],  # important: don't postprocess the action scores
        }
    else:
        cfg = {}  # PG, IMPALA, A2C, etc.
    run_experiments({
        "parametric_cartpole": {
            "run": args.run,
        last_layer = slim.fully_connected(
            input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
        last_layer = slim.fully_connected(
            last_layer, 64, activation_fn=tf.nn.relu, scope="fc2")
        output = slim.fully_connected(
            last_layer, num_outputs, activation_fn=None, scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
        last_layer = tf.layers.batch_normalization(
            last_layer, training=input_dict["is_training"])
        output = slim.fully_connected(
            last_layer,
            num_outputs,
            weights_initializer=normc_initializer(0.01),
            activation_fn=None,
            scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()
    ModelCatalog.register_custom_model("bn_model", BatchNormModel)
    run_experiments({
        "batch_norm_demo": {
            "run": args.run,
            "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "model": {
                    "custom_model": "bn_model",
                },
                "num_workers": 0,
            },
        },
    })
""" def _build_layers_v2(self, input_dict, num_outputs, options): print(input_dict) self.obs_in = input_dict["obs"] self.fcnet = FullyConnectedNetwork( input_dict, self.obs_space, self.action_space, num_outputs, options ) return self.fcnet.outputs, self.fcnet.last_layer if __name__ == "__main__": print("THIS EXPERIMENT HAS NOT BEEN FULLY TESTED") kill_server() ray.init() ModelCatalog.register_custom_model("my_model", CustomModel) tune.run( "PPO", stop={"timesteps_total": 1000000}, checkpoint_freq=1, config={ "env": CarlaEnv, # CarlaEnv,SimpleCorridor, # or "corridor" if registered above "model": {"custom_model": "my_model"}, "lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs "num_workers": 4, # parallelism "num_gpus_per_worker": 0.2, "env_config": env_config, }, resume=False, )
import ray
from gym.wrappers import TimeLimit
from ray import tune
from ray.rllib.models import ModelCatalog
from ray.tune import register_env

from custom_envs.corridor_env import CorridorEnv
from custom_models.corridor_net import CorridorNet

if __name__ == '__main__':
    register_env(
        'CorridorEnv',
        lambda env_config: TimeLimit(
            CorridorEnv(env_config['length']),
            max_episode_steps=env_config['length']))
    ModelCatalog.register_custom_model('CorridorNet', CorridorNet)

    ray.init(local_mode=True)
    tune.run(
        'PPO',
        stop={'episode_reward_mean': 0.9},
        config={
            'env': 'CorridorEnv',
            'env_config': {
                'length': tune.grid_search([5, 10, 50]),
            },
            'model': {
                'custom_model': 'CorridorNet',
                'custom_options': {},
            },
            'use_pytorch': True,
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models import ModelCatalog
from training.hierarchical_learning.bomberman_arena_multi_env import BombermanArenaEnv
from training.hierarchical_learning.hierarchical_bomberman_multi_env import *
from ray import tune
from training.hierarchical_learning.arena_callback import MyCallbacks
from training.train_with_action_masking_2.tfnet_with_masking import ComplexInputNetwork

if __name__ == '__main__':
    ray.init(
        _redis_max_memory=1024 * 1024 * 100,
        num_gpus=1,
        object_store_memory=10 * 2**30)
    env = HierarchicalBombermanMultiEnv([f'agent_{i}_high' for i in range(4)])
    ModelCatalog.register_custom_model("custom_model", ComplexInputNetwork)
    tune.register_env(
        'BomberMan-v0',
        lambda c: BombermanArenaEnv([f'agent_{i}' for i in range(4)]))

    def policy_mapping_fn(agent_id):
        if agent_id.startswith("agent_0"):
            return "policy_kill"
        else:
            return "policy_kill_opp"

    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        # trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-22_10-57-05mz9533ge\\checkpoint_000140\\checkpoint-140')
        iter = 0

        # def update_phase(ev):
            stride,
            activation_fn=activation,
            padding="VALID",
            scope="fc1")
        fc2 = slim.conv2d(
            fc1,
            num_outputs, [1, 1],
            activation_fn=None,
            normalizer_fn=None,
            scope="fc2")
        print(fc1, fc2)
        print(flatten(fc1), flatten(fc2))
        # exit(123)
        return flatten(fc2), flatten(fc1)


ModelCatalog.register_custom_model("my_model", MyModelClass)

model = {
    "use_lstm": True,
    "conv_activation": "elu",
    "custom_model": "my_model",
    "dim": 42,
    "grayscale": True,
    "zero_mean": False,
    # Reduced channel depth and kernel size from default
            [a1_logits, a2_logits])
        self.action_model.summary()
        self.register_variables(self.action_model.variables)

    def forward(self, input_dict, state, seq_lens):
        context, self._value_out = self.base_model(input_dict["obs"])
        return context, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model("autoregressive_model",
                                       AutoregressiveActionsModel)
    ModelCatalog.register_custom_action_dist("binary_autoreg_output",
                                             BinaryAutoregressiveOutput)
    tune.run(
        args.run,
        stop={"episode_reward_mean": args.stop},
        config={
            "env": CorrelatedActionsEnv,
            "gamma": 0.5,
            "num_gpus": 0,
            "model": {
                "custom_model": "autoregressive_model",
                "custom_action_dist": "binary_autoreg_output",
            },
        })
def persuasive_a3c_conf(rollout_size=10,
                        agents=100,
                        debug_folder=None,
                        eval_folder=None,
                        alpha=0.0001,
                        gamma=0.99):
    """
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/a3c/a3c.py#L14
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """
    ModelCatalog.register_custom_model('custom_rrn', RNNModel)
    ModelCatalog.register_custom_action_dist(
        "custom_action_distribution", PersuasiveActionDistribution)

    custom_configuration = DEFAULT_CONFIG

    custom_configuration['collect_metrics_timeout'] = 86400  # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    custom_configuration['num_gpus_per_worker'] = 1
    custom_configuration['num_gpus'] = 1
    custom_configuration['num_workers'] = 1
    custom_configuration['output'] = debug_folder
    custom_configuration['remote_env_batch_wait_ms'] = 1000
    custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42
    custom_configuration['timesteps_per_iteration'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['no_done_at_end'] = False
    # Divide episodes into fragments of this many steps each during rollouts.
    # Sample batches of this size are collected from rollout workers and
    # combined into a larger batch of `train_batch_size` for learning.
    # For example, given rollout_fragment_length=100 and train_batch_size=1000:
    #   1. RLlib collects 10 fragments of 100 steps each from rollout workers.
    #   2. These fragments are concatenated and we perform an epoch of SGD.
    # When using multiple envs per worker, the fragment size is multiplied by
    # `num_envs_per_worker`. This is since we are collecting steps from
    # multiple envs in parallel. For example, if num_envs_per_worker=5, then
    # rollout workers will return experiences in chunks of 5*100 = 500 steps.
    # The dataflow here can vary per algorithm. For example, PPO further
    # divides the train batch into minibatches for multi-epoch SGD.
    custom_configuration['rollout_fragment_length'] = rollout_size
    # Training batch size, if applicable. Should be >= rollout_fragment_length.
    # Sample batches will be concatenated together to a batch of this size,
    # which is then passed to SGD.
    custom_configuration['train_batch_size'] = rollout_size * agents

    # === Exploration Settings ===
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/stochastic_sampling.py
    # custom_configuration['exploration_config']['type'] = 'StochasticSampling'
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    custom_configuration['exploration_config']['final_epsilon'] = 0.0001

    # ==================== MODEL - DEFAULT ====================
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # === Built-in options ===
    # Filter config.
    # List of [out_channels, kernel, stride] for each filter.
    # custom_configuration['model']['conv_filters'] = None
    # Nonlinearity for built-in convnet.
    # custom_configuration['model']['conv_activation'] = "relu"
    # Nonlinearity for fully connected net (tanh, relu).
    # custom_configuration['model']['fcnet_activation'] = "tanh"
    # Number of hidden layers for fully connected net.
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # For DiagGaussian action distributions, make the second half of the model
    # outputs floating bias variables instead of state-dependent. This only
    # has an effect if using the default fully connected net.
    # custom_configuration['model']['free_log_std'] = False
    # Whether to skip the final linear layer used to resize the hidden layer
    # outputs to size `num_outputs`. If True, then the last hidden layer
    # should already match num_outputs.
    # custom_configuration['model']['no_final_linear'] = False
    # Whether layers should be shared for the value function.
    # custom_configuration['model']['vf_share_layers'] = True

    # == LSTM ==
    # Whether to wrap the model with an LSTM.
    # custom_configuration['model']['use_lstm'] = False
    # Max seq len for training the LSTM, defaults to 20.
    # custom_configuration['model']['max_seq_len'] = 20
    # Size of the LSTM cell.
    # custom_configuration['model']['lstm_cell_size'] = 64
    # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    # custom_configuration['model']['lstm_use_prev_action_reward'] = False
    # When using modelv1 models with a modelv2 algorithm, you may have to
    # define the state shape here (e.g., [256, 256]).
    # custom_configuration['model']['state_shape'] = None  # [64, 64]

    # == Atari ==
    # Whether to enable framestack for Atari envs.
    # custom_configuration['model']['framestack'] = True
    # Final resized frame dimension.
    # custom_configuration['model']['dim'] = 84
    # (deprecated) Converts ATARI frame to 1 Channel Grayscale image.
    # custom_configuration['model']['grayscale'] = False
    # (deprecated) Changes frame to range from [-1, 1] if true.
    # custom_configuration['model']['zero_mean'] = True

    # === Options for custom models ===
    # Name of a custom model to use.
    custom_configuration['model']['custom_model'] = 'custom_rrn'
    # Extra options to pass to the custom classes. These will be available
    # in the Model's custom_model_config.
    custom_configuration['model']['custom_model_config'] = {}
    # Name of a custom action distribution to use. See:
    # https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # == OPTIMIZER ==
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    # custom_configuration['optimizer'] = {}

    # == Persuasive A3C ==
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['min_iter_time_s'] = 5
    custom_configuration['use_gae'] = True

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 5
    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5
    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False
    # Typical usage is to pass extra args to the evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = False
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...}
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1
    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method for the default implementation. The trainer
    # guarantees all eval workers have the latest policy state before this
    # function is called.
    custom_configuration['custom_eval_function'] = None  # custom_eval_function

    return custom_configuration
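The comments above describe the `(trainer: Trainer, eval_workers: WorkerSet) -> metrics` contract for `custom_eval_function`. Below is a minimal sketch of such a function, mirroring what Ray 1.0's default `Trainer._evaluate()` does; the `collect_metrics` import is the standard RLlib helper, but the sketch itself is not part of the original configuration.

# Hedged sketch only, not part of the original module.
import ray
from ray.rllib.evaluation.metrics import collect_metrics


def example_custom_eval_function(trainer, eval_workers):
    # Draw one sample batch from every remote evaluation worker.
    ray.get([w.sample.remote() for w in eval_workers.remote_workers()])
    # Aggregate the collected episodes into the standard metrics dict.
    return collect_metrics(eval_workers.local_worker(),
                           eval_workers.remote_workers())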
        Tout=tf.float32)
    return penalty - tf.reduce_mean(action_dist.logp(actions) * rewards)


# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("eager_model", EagerModel)

    tune.run(
        MyTrainer,
        stop={"training_iteration": args.iters},
        config={
            "env": "CartPole-v0",
            "num_workers": 0,
            "model": {
                "custom_model": "eager_model"
            },
        })
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-reward", type=float, default=90)
parser.add_argument("--stop-iters", type=int, default=100)
parser.add_argument("--stop-timesteps", type=int, default=100000)

if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(num_cpus=args.num_cpus or None)

    ModelCatalog.register_custom_model(
        "rnn", TorchRNNModel if args.torch else RNNModel)
    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
    register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())

    config = {
        "env": args.env,
        "env_config": {
            "repeat_delay": 2,
        },
        "gamma": 0.9,
        "num_workers": 0,
        "num_envs_per_worker": 20,
        "entropy_coeff": 0.001,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 1e-5,
        "model": {
            reward = 1
        else:
            reward = -1
        done = len(self.history) > 100
        return self._next_obs(), reward, done, {}

    def _next_obs(self):
        token = random.choice([0, 1])
        self.history.append(token)
        return token


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model("rnn", MyKerasRNN)
    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
    register_env("RepeatInitialEnv", lambda _: RepeatInitialEnv())

    config = {
        "env": args.env,
        "env_config": {
            "repeat_delay": 2,
        },
        "gamma": 0.9,
        "num_workers": 0,
        "num_envs_per_worker": 20,
        "entropy_coeff": 0.001,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 1e-5,
        "model": {
    multi_hunter_trainer = PPOTrainer(MultiHunterEnv, config)
    for _ in range(100):
        environment.simulate()
        result = multi_hunter_trainer.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = multi_hunter_trainer.save()
    multi_hunter_trainer.stop()


if __name__ == '__main__':
    training = True
    ray.init()
    ModelCatalog.register_custom_model("DQNModel", DQNModel_Hunter)
    config_hunter = {
        "num_gpus": 0,
        "num_workers": 1,
        "framework": "torch",
        "lr": 4e-3,
        # "lr": tune.grid_search([5e-3, 2e-3, 1e-3, 5e-4]),
        "gamma": 0.985,
        # "gamma": tune.grid_search([0.983, 0.985, 0.986, 0.987, 0.988, 0.989]),
        "epsilon": 1,
        "epsilon_decay": 0.99998,
        "epsilon_min": 0.01,
        "buffer_size": 20000,
        "batch_size": 2000,
        "env": MultiHunterEnv,
        "env_config": {
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-reward", type=float, default=150.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
    ModelCatalog.register_custom_model(
        "pa_model", TorchParametricActionsModel
        if args.torch else ParametricActionsModel)

    if args.run == "DQN":
        cfg = {
            # TODO(ekl) we need to set these to prevent the masked values
            # from being further processed in DistributionalQModel, which
            # would mess up the masking. It is possible to support these if we
            # defined a custom DistributionalQModel that is aware of masking.
            "hiddens": [],
            "dueling": False,
        }
    else:
        cfg = {}

    config = dict(
name="CCPPO", postprocess_fn=centralized_critic_postprocessing, loss_fn=loss_with_central_critic, before_loss_init=setup_mixins, grad_stats_fn=central_vf_stats, mixins=[ LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, CentralizedValueMixin ]) CCTrainer = PPOTrainer.with_updates( name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None) if __name__ == "__main__": args = parser.parse_args() ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel) tune.run( CCTrainer, stop={ "timesteps_total": args.stop, "episode_reward_mean": 7.99, }, config={ "env": TwoStepGame, "batch_mode": "complete_episodes", "eager": False, "num_workers": 0, "multiagent": { "policies": { "pol1": (None, Discrete(6), TwoStepGame.action_space, {}), "pol2": (None, Discrete(6), TwoStepGame.action_space, {}),
            activation_fn=tf.nn.relu,
            scope="fc2")
        output = slim.fully_connected(
            last_layer,
            num_outputs,
            activation_fn=None,
            scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
    env = clip_reward(env, lower_bound=-1, upper_bound=1)
    env = sticky_actions(env, repeat_action_probability=0.25)
    env = resize(env, 84, 84)
    # env = color_reduction(env, mode='full')
    # env = frame_skip(env, 4)
    env = frame_stack(env, 4)
    env = agent_indicator(env, type_only=False)
    return env


register_env(env_name, lambda config: PettingZooEnv(env_creator(config)))

test_env = PettingZooEnv(env_creator({}))
obs_space = test_env.observation_space
act_space = test_env.action_space

ModelCatalog.register_custom_model("AtariModel", AtariModel)
# ModelCatalog.register_custom_model("RandomPolicy", RandomPolicy)


def gen_policy(i):
    config = {
        "model": {
            "custom_model": "AtariModel",
        },
        "gamma": 0.99,
    }
    return (None, obs_space, act_space, config)


policies = {
    "policy_0": gen_policy(0),
    "random": (RandomPolicy, obs_space, act_space,
               {"ignore_action_bounds": True}),
}
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init()
    else:
        ray.init(local_mode=True)

    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) %
                               len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

    ModelCatalog.register_custom_model("my_rnn", RNNModel)

    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (None, OBSERVATION_SPACE, ACTION_SPACE, {},)
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "model": {
            "custom_model": "my_rnn",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "train_batch_size": 10240 * 3,
        "observation_filter": "MeanStdFilter",
        "batch_mode": "complete_episodes",
        "grad_clip": 0.5,
        # "model": {
        #     "use_lstm": True,
        # },
    }

    tune_config.update(
        {
            "lambda": 0.95,
            "clip_param": 0.2,
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 1024,
        }
    )

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario="multi_scenarios",
        algorithm="PPO",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        PPOTrainer,  # "PPO",
        name=experiment_name,
        stop={"time_total_s": 24 * 60 * 60},
        checkpoint_freq=2,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
    )

    print(analysis.dataframe().head())
    metrics = summarize_episodes(episodes)
    eval_metrics.append(metrics)
    return metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-iters", type=int, default=10)
    parser.add_argument("--num-workers", type=int, default=2)
    args = parser.parse_args()

    ray.init()
    register_env("dominion", lambda config: DominionEnv(config))
    ModelCatalog.register_custom_model("domraymodel", DomrayModel)

    config = {
        "env": DominionEnv,
        "env_config": env_config,
        "num_gpus": 1,
        "train_batch_size": 200,
        "model": {
            "custom_model": "domraymodel",
            # TODO: 34 is the action space size, refactor
            "fcnet_hiddens": [256, 256, 34],
            "vf_share_layers": True,
        },
        "callbacks": DomCallbacks,
        # Evaluation settings
def register_actor_mask_model():
    ModelCatalog.register_custom_model("action_mask", ActionMaskModel)
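For reference, a minimal usage sketch (an assumption, not part of the original snippet): once the helper has been called, the registered name is referenced through the standard `custom_model` key of an RLlib trainer config.

# Hedged sketch: reference the name registered above from a trainer config.
register_actor_mask_model()
config = {
    "model": {
        "custom_model": "action_mask",  # name registered by the helper
    },
}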
tf1, tf, tfv = try_import_tf()

parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model(
        "bn_model", TorchBatchNormModel if args.torch else
        KerasBatchNormModel if args.run != "PPO" else BatchNormModel)
    config = {
        "env": "Pendulum-v0" if args.run in ["DDPG", "SAC"] else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "lr": 0.0003,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "framework": "torch" if args.torch else "tf",
    }

    stop = {
            shape=(84, 84, 4),
            dtype=np.float32)
        self.i = 0

    def reset(self):
        self.i = 0
        return self.zeros

    def step(self, action):
        self.i += 1
        return self.zeros, 1, self.i > 1000, {}


if __name__ == "__main__":
    ray.init()
    ModelCatalog.register_custom_model("fast_model", FastModel)
    run_experiments({
        "demo": {
            "run": "IMPALA",
            "env": FastImageEnv,
            "config": {
                "compress_observations": True,
                "model": {
                    "custom_model": "fast_model"
                },
                "num_gpus": 0,
                "num_workers": 2,
                "num_envs_per_worker":
tf1, tf, tfv = try_import_tf()

parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model(
        "bn_model", TorchBatchNormModel if args.torch else BatchNormModel)
    config = {
        "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "num_workers": 0,
        "framework": "torch" if args.torch else "tf",
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
    def forward_rnn(self, inputs, state, seq_lens):
        model_out, self._value_out, h, c = self.rnn_model(
            [inputs, seq_lens] + state)
        return model_out, [h, c]

    def get_initial_state(self):
        return [
            np.zeros(self.cell_size, np.float32),
            np.zeros(self.cell_size, np.float32),
        ]

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])


ModelCatalog.register_custom_model("my_model", MyKerasModel)

analysis = tune.run(
    PPOTrainer,
    stop={"timesteps_total": 100000},
    config={
        "env": BlackjackEnv,
        "gamma": 0.99,
        "num_workers": 1,
        "num_envs_per_worker": 8,
        "entropy_coeff": 0.001,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 1e-5,
        "lr": tune.grid_search([0.0001, 0.0005, 0.00001, 0.00005]),
        "model": {
            "custom_model": "my_model",
    def testCustomModel(self):
        ray.init()
        ModelCatalog.register_custom_model("foo", CustomModel)
        p1 = ModelCatalog.get_model(
            get_registry(), 1, 5, {"custom_model": "foo"})
        self.assertEqual(str(type(p1)), str(CustomModel))
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss

    def custom_stats(self):
        return {
            "policy_loss": self.policy_loss,
            "imitation_loss": self.imitation_loss,
        }


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("custom_loss", CustomLossModel)
    run_experiments({
        "custom_loss": {
            "run": "PG",
            "env": "CartPole-v0",
            "stop": {
                "training_iteration": args.iters,
            },
            "config": {
                "num_workers": 0,
                "model": {
                    "custom_model": "custom_loss",
                    "custom_options": {
                        "input_files": args.input_files,
                    },
                },