def observation(self, obs):
    # Debug output: max-x/y positions to watch exploration progress.
    if self.step_count == 0:
        if self.x_positions:
            # max_diff = max(
            #     np.sqrt((np.array(self.x_positions) - self.init_x) ** 2 + (
            #         np.array(self.y_positions) - self.init_y) ** 2))
            # print("After reset: max delta-x/y={}".format(max_diff))
            self.x_positions = []
            self.y_positions = []
        self.init_x = self.agent_pos[0]
        self.init_y = self.agent_pos[1]

    # Are we carrying the key?
    if self.carrying is not None:
        print("Carrying KEY!!")

    self.x_positions.append(self.agent_pos[0])
    self.y_positions.append(self.agent_pos[1])

    # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
    objects = one_hot(obs[:, :, 0], depth=11)
    colors = one_hot(obs[:, :, 1], depth=6)
    states = one_hot(obs[:, :, 2], depth=3)

    # Is the door we see open?
    for x in range(7):
        for y in range(7):
            if objects[x, y, 4] == 1.0 and states[x, y, 0] == 1.0:
                print("Door OPEN!!")

    all_ = np.concatenate([objects, colors, states], -1)
    ret = np.reshape(all_, (-1, ))
    direction = one_hot(
        np.array(self.agent_dir), depth=4).astype(np.float32)
    return np.concatenate([ret, direction])
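# A minimal NumPy sketch (an assumption, not the repo's actual helper) of a
# `one_hot(x, depth)` function compatible with the calls above: it maps an
# integer array of shape (7, 7) to shape (7, 7, depth). Flattening the
# concatenated (11 + 6 + 3)-deep encoding of the 7x7x3 MiniGrid view and
# appending the 4-dim direction one-hot yields a 7 * 7 * 20 + 4 = 984-dim
# observation vector.
import numpy as np


def one_hot_sketch(x, depth):
    """Return a float32 one-hot encoding of integer array `x` along a new last axis."""
    x = np.asarray(x, dtype=np.int64)
    out = np.zeros(x.shape + (depth, ), dtype=np.float32)
    np.put_along_axis(out, x[..., None], 1.0, axis=-1)
    return out


# Example: a fake 7x7x3 MiniGrid observation flattens to 7 * 7 * 20 entries.
_obs = np.zeros((7, 7, 3), dtype=np.uint8)
_flat = np.concatenate(
    [
        one_hot_sketch(_obs[:, :, 0], 11),
        one_hot_sketch(_obs[:, :, 1], 6),
        one_hot_sketch(_obs[:, :, 2], 3),
    ],
    -1,
).reshape(-1)
assert _flat.shape == (7 * 7 * 20, )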
def test_multi_agent_sample_round_robin(self):
    ev = RolloutWorker(
        env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
        rollout_fragment_length=50,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
    # Since we round-robin introduce agents into the env, some of the env
    # steps don't count as proper transitions.
    self.assertEqual(batch.policy_batches["p0"].count, 42)
    check(
        batch.policy_batches["p0"]["obs"][:10],
        one_hot(np.array([0, 1, 2, 3, 4] * 2), 10),
    )
    check(
        batch.policy_batches["p0"]["new_obs"][:10],
        one_hot(np.array([1, 2, 3, 4, 5] * 2), 10),
    )
    self.assertEqual(
        batch.policy_batches["p0"]["rewards"].tolist()[:10],
        [100, 100, 100, 100, 0] * 2,
    )
    self.assertEqual(
        batch.policy_batches["p0"]["dones"].tolist()[:10],
        [False, False, False, False, True] * 2,
    )
    self.assertEqual(
        batch.policy_batches["p0"]["t"].tolist()[:10],
        [4, 9, 14, 19, 24, 5, 10, 15, 20, 25],
    )
def test_multi_agent_complex_spaces(self):
    ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
    ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
    register_env("nested_ma", lambda _: NestedMultiAgentEnv())
    act_space = spaces.Discrete(2)
    pg = PGTrainer(
        env="nested_ma",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "multiagent": {
                "policies": {
                    "tuple_policy": (PGTFPolicy, TUPLE_SPACE, act_space, {
                        "model": {
                            "custom_model": "tuple_spy"
                        }
                    }),
                    "dict_policy": (PGTFPolicy, DICT_SPACE, act_space, {
                        "model": {
                            "custom_model": "dict_spy"
                        }
                    }),
                },
                "policy_mapping_fn": lambda a: {
                    "tuple_agent": "tuple_policy",
                    "dict_agent": "dict_policy"
                }[a],
            },
            "framework": "tf",
        })

    # Skip the first passes as they come from the Policy's loss
    # initialization.
    TupleSpyModel.capture_index = DictSpyModel.capture_index = 0
    pg.train()

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
def test_py_torch_model(self):
    ModelCatalog.register_custom_model("composite", TorchSpyModel)
    register_env("nested", lambda _: NestedDictEnv())
    a2c = A2CTrainer(
        env="nested",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite",
            },
            "framework": "torch",
        })

    # Skip first passes as they came from the TorchPolicy loss
    # initialization.
    TorchSpyModel.capture_index = 0
    a2c.train()

    # Check that the model sees the correct reconstructed observations.
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "torch_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        # Only look at the last entry (-1) in `seen` as we reset (re-use)
        # the ray-kv indices before training.
        self.assertEqual(seen[0][-1].tolist(), pos_i)
        self.assertEqual(seen[1][-1].tolist(), cam_i)
        check(seen[2][-1], task_i)
def do_test_nested_tuple(self, make_env):
    ModelCatalog.register_custom_model("composite2", TupleSpyModel)
    register_env("nested2", make_env)
    pg = PGTrainer(
        env="nested2",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite2",
            },
            "framework": "tf",
        })

    # Skip the first passes as they come from the Policy's loss
    # initialization.
    TupleSpyModel.capture_index = 0
    pg.train()

    # Check that the model sees the correct reconstructed observations.
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
def do_test_nested_dict(self, make_env, test_lstm=False):
    ModelCatalog.register_custom_model("composite", DictSpyModel)
    register_env("nested", make_env)
    pg = PGTrainer(
        env="nested",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite",
                "use_lstm": test_lstm,
            },
            "framework": "tf",
        })

    # Skip the first passes as they come from the Policy's loss
    # initialization.
    DictSpyModel.capture_index = 0
    pg.train()

    # Check that the model sees the correct reconstructed observations.
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
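# A hedged illustration (hypothetical space, not the repo's DICT_SPACE) of
# the flatten-then-restore round trip the nested-space tests above exercise:
# nested Dict/Tuple observations get flattened into a single vector for
# batching, and the spy models check that the model later sees the correctly
# reconstructed nested values again.
import numpy as np
from gym import spaces

_NESTED = spaces.Dict({
    "sensors": spaces.Dict({
        "position": spaces.Box(-1.0, 1.0, (3, )),
        "front_cam": spaces.Box(0.0, 1.0, (2, 2, 3)),
    }),
    "task": spaces.Discrete(5),
})

_sample = _NESTED.sample()
_flat = spaces.flatten(_NESTED, _sample)
# 3 + 2*2*3 + 5 entries (Discrete becomes one-hot when flattened by gym).
assert _flat.shape == (3 + 12 + 5, )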
def observation(self, obs):
    # Debug output: max-x/y positions to watch exploration progress.
    if self.step_count == 0:
        for _ in range(self.framestack):
            self.frame_buffer.append(np.zeros((self.single_frame_dim, )))
        if self.vector_index == 0:
            if self.x_positions:
                max_diff = max(
                    np.sqrt((np.array(self.x_positions) - self.init_x)**2 +
                            (np.array(self.y_positions) - self.init_y)**2))
                self.x_y_delta_buffer.append(max_diff)
                print("100-average dist travelled={}".format(
                    np.mean(self.x_y_delta_buffer)))
                self.x_positions = []
                self.y_positions = []
            self.init_x = self.agent_pos[0]
            self.init_y = self.agent_pos[1]

    # Are we carrying the key?
    # if self.carrying is not None:
    #     print("Carrying KEY!!")

    self.x_positions.append(self.agent_pos[0])
    self.y_positions.append(self.agent_pos[1])

    # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
    objects = one_hot(obs[:, :, 0], depth=11)
    colors = one_hot(obs[:, :, 1], depth=6)
    states = one_hot(obs[:, :, 2], depth=3)

    # Is the door we see open?
    # for x in range(7):
    #     for y in range(7):
    #         if objects[x, y, 4] == 1.0 and states[x, y, 0] == 1.0:
    #             print("Door OPEN!!")

    all_ = np.concatenate([objects, colors, states], -1)
    all_flat = np.reshape(all_, (-1, ))
    direction = one_hot(np.array(self.agent_dir), depth=4).astype(np.float32)
    single_frame = np.concatenate([all_flat, direction])
    self.frame_buffer.append(single_frame)
    return np.concatenate(self.frame_buffer)
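# A minimal sketch (assumed attribute setup, not the example's actual
# __init__) of the frame-stacking state used by the wrapper above: a
# fixed-length deque drops the oldest frame on every append, so the
# concatenated buffer always has framestack * single_frame_dim entries.
from collections import deque

import numpy as np


class _FrameStackStateSketch:
    def __init__(self, framestack=4, single_frame_dim=7 * 7 * 20 + 4):
        self.framestack = framestack
        self.single_frame_dim = single_frame_dim
        # maxlen makes the deque behave as a sliding window over frames.
        self.frame_buffer = deque(maxlen=framestack)
        for _ in range(framestack):
            self.frame_buffer.append(np.zeros((single_frame_dim, )))

    def stacked(self):
        return np.concatenate(self.frame_buffer)


_s = _FrameStackStateSketch()
assert _s.stacked().shape == (4 * (7 * 7 * 20 + 4), )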
def test_simple_q_loss_function(self):
    """Tests the Simple-Q loss function results on all frameworks."""
    config = dqn.simple_q.SimpleQConfig().rollouts(num_rollout_workers=0)
    # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
    config.training(model={
        "fcnet_hiddens": [10],
        "fcnet_activation": "linear",
    })

    for fw in framework_iterator(config):
        # Generate Trainer and get its default Policy object.
        trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Batch of size=2.
        input_ = SampleBatch({
            SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
            SampleBatch.ACTIONS: np.array([0, 1]),
            SampleBatch.REWARDS: np.array([0.4, -1.23]),
            SampleBatch.DONES: np.array([False, False]),
            SampleBatch.NEXT_OBS: np.random.random(size=(2, 4)),
            SampleBatch.EPS_ID: np.array([1234, 1234]),
            SampleBatch.AGENT_INDEX: np.array([0, 0]),
            SampleBatch.ACTION_LOGP: np.array([-0.1, -0.1]),
            SampleBatch.ACTION_DIST_INPUTS: np.array(
                [[0.1, 0.2], [-0.1, -0.2]]),
            SampleBatch.ACTION_PROB: np.array([0.1, 0.2]),
            "q_values": np.array([[0.1, 0.2], [0.2, 0.1]]),
        })
        # Get model vars for computing expected model outs (q-vals).
        # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
        vars = policy.get_weights()
        if isinstance(vars, dict):
            vars = list(vars.values())
        vars_t = policy.target_model.variables()
        if fw == "tf":
            vars_t = policy.get_session().run(vars_t)

        # Q(s,a) outputs.
        q_t = np.sum(
            one_hot(input_[SampleBatch.ACTIONS], 2)
            * fc(
                fc(
                    input_[SampleBatch.CUR_OBS],
                    vars[0 if fw != "torch" else 2],
                    vars[1 if fw != "torch" else 3],
                    framework=fw,
                ),
                vars[2 if fw != "torch" else 0],
                vars[3 if fw != "torch" else 1],
                framework=fw,
            ),
            1,
        )
        # max[a'](Qtarget(s',a')) outputs.
        q_target_tp1 = np.max(
            fc(
                fc(
                    input_[SampleBatch.NEXT_OBS],
                    vars_t[0 if fw != "torch" else 2],
                    vars_t[1 if fw != "torch" else 3],
                    framework=fw,
                ),
                vars_t[2 if fw != "torch" else 0],
                vars_t[3 if fw != "torch" else 1],
                framework=fw,
            ),
            1,
        )
        # TD-errors (Bellman equation).
        td_error = q_t - config.gamma * input_[
            SampleBatch.REWARDS] + q_target_tp1
        # Huber/Square loss on TD-error.
        expected_loss = huber_loss(td_error).mean()

        if fw == "torch":
            input_ = policy._lazy_tensor_dict(input_)

        # Get actual out and compare.
        if fw == "tf":
            out = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False),
            )
        else:
            out = (loss_torch if fw == "torch" else loss_tf)(
                policy, policy.model, None, input_)
        check(out, expected_loss, decimals=1)
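# A hedged NumPy sketch of the element-wise Huber loss applied to the
# TD-errors above (delta=1.0 assumed): quadratic for |x| <= delta, linear
# beyond; this is the standard definition, and the repo's own `huber_loss`
# helper is assumed to follow it.
import numpy as np


def huber_loss_sketch(x, delta=1.0):
    quadratic = np.minimum(np.abs(x), delta)
    linear = np.abs(x) - quadratic
    return 0.5 * quadratic ** 2 + delta * linear


# Example: small errors are squared, large ones grow linearly.
print(huber_loss_sketch(np.array([0.1, 2.0])))  # ~[0.005, 1.5]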
def do_test_log_likelihood(run,
                           config,
                           prev_a=None,
                           continuous=False,
                           layer_key=("fc", (0, 4),
                                      ("_hidden_layers.0.", "_logits.")),
                           logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        if run in [sac.SACTrainer] and fw == "tfe":
            continue

        trainer = run(config=config, env=env)
        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 1000 if not continuous else 50
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(
                    obs_batch[0],
                    prev_action=prev_a,
                    prev_reward=prev_r,
                    explore=True))

        # Test all taken actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(num_actions):
                a = actions[idx]
                if fw != "torch":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(
                                obs_batch,
                                vars["default_policy/{}_1/kernel".format(
                                    layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["{}_model.0.weight".format(layer_key[2][0])],
                           framework=fw),
                        vars["{}_model.0.weight".format(layer_key[2][1])],
                        framework=fw)
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_prob = count / num_actions
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(np.exp(logp), expected_prob, atol=0.2)
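# A hedged closed-form sketch of the per-dimension Gaussian log-likelihood
# that the continuous branch above computes via scipy's norm.pdf: for a
# diagonal Gaussian, log p(a) = -0.5 * ((a - mean) / std)^2 - log_std
# - 0.5 * log(2 * pi) per dimension (summed over dimensions for the full
# action log-prob).
import numpy as np
from scipy.stats import norm


def gaussian_logp_sketch(a, mean, log_std):
    std = np.exp(log_std)
    return (-0.5 * ((a - mean) / std) ** 2 - log_std
            - 0.5 * np.log(2.0 * np.pi))


# Matches np.log(norm.pdf(...)) element-wise.
_a, _m, _ls = 0.3, 0.1, -0.5
assert np.isclose(
    gaussian_logp_sketch(_a, _m, _ls),
    np.log(norm.pdf(_a, _m, np.exp(_ls))))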
def test_simple_q_loss_function(self):
    """Tests the Simple-Q loss function results on all frameworks."""
    config = dqn.SIMPLE_Q_DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    for fw in framework_iterator(config):
        # Generate Trainer and get its default Policy object.
        trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Batch of size=2.
        input_ = {
            SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
            SampleBatch.ACTIONS: np.array([0, 1]),
            SampleBatch.REWARDS: np.array([0.4, -1.23]),
            SampleBatch.DONES: np.array([False, False]),
            SampleBatch.NEXT_OBS: np.random.random(size=(2, 4))
        }
        # Get model vars for computing expected model outs (q-vals).
        # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
        vars = policy.get_weights()
        if isinstance(vars, dict):
            vars = list(vars.values())
        vars_t = policy.target_q_func_vars
        if fw == "tf":
            vars_t = policy.get_session().run(vars_t)

        # Q(s,a) outputs.
        q_t = np.sum(
            one_hot(input_[SampleBatch.ACTIONS], 2) *
            fc(fc(input_[SampleBatch.CUR_OBS],
                  vars[0 if fw != "torch" else 2],
                  vars[1 if fw != "torch" else 3],
                  framework=fw),
               vars[2 if fw != "torch" else 0],
               vars[3 if fw != "torch" else 1],
               framework=fw), 1)
        # max[a'](Qtarget(s',a')) outputs.
        q_target_tp1 = np.max(
            fc(fc(input_[SampleBatch.NEXT_OBS],
                  vars_t[0 if fw != "torch" else 2],
                  vars_t[1 if fw != "torch" else 3],
                  framework=fw),
               vars_t[2 if fw != "torch" else 0],
               vars_t[3 if fw != "torch" else 1],
               framework=fw), 1)
        # TD-errors (Bellman equation).
        td_error = q_t - config["gamma"] * input_[SampleBatch.REWARDS] + \
            q_target_tp1
        # Huber/Square loss on TD-error.
        expected_loss = huber_loss(td_error).mean()

        if fw == "torch":
            input_ = policy._lazy_tensor_dict(input_)

        # Get actual out and compare.
        if fw == "tf":
            out = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False))
        else:
            out = (loss_torch if fw == "torch" else loss_tf)(
                policy, policy.model, None, input_)
        check(out, expected_loss, decimals=1)
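# A hedged NumPy sketch of the dense-layer helper the two Simple-Q tests
# above rely on: fc(x, W, b) ~ x @ W + b. The torch-specific index swapping
# in the tests exists because torch Linear kernels are stored as
# (out_features, in_features); transposing them for torch (as sketched
# here, an assumption about the real helper) lets the same matmul work for
# all frameworks.
import numpy as np


def fc_sketch(x, weights, biases=None, framework=None):
    W = np.asarray(weights, dtype=np.float32)
    if framework == "torch":
        W = W.T  # torch stores Linear kernels transposed.
    out = np.asarray(x, dtype=np.float32) @ W
    if biases is not None:
        out = out + np.asarray(biases, dtype=np.float32)
    return out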
def test_log_likelihood(run,
                        config,
                        prev_a=None,
                        continuous=False,
                        layer_key=("fc", (0, 4)),
                        logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    # Use Soft-Q for DQNs.
    if run is dqn.DQNTrainer:
        config["exploration_config"] = {"type": "SoftQ", "temperature": 0.5}

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if run in [dqn.DQNTrainer, sac.SACTrainer] and fw == "torch":
            continue
        print("Testing {} with framework={}".format(run, fw))
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        trainer = run(config=config, env=env)
        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 500
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(
                    obs_batch[0],
                    prev_action=prev_a,
                    prev_reward=prev_r,
                    explore=True))

        # Test 50 actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(50):
                a = actions[idx]
                if fw == "tf" or fw == "eager":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(
                                obs_batch,
                                vars["default_policy/{}_1/kernel".format(
                                    layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["_hidden_layers.0._model.0.weight"]),
                        vars["_logits._model.0.weight"])
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_logp = np.log(count / num_actions)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp, rtol=0.3)
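# A minimal sketch of the frequency-vs-log-likelihood check used by the
# discrete branches above: with stochastic exploration, the empirical
# frequency of each sampled action should be close to
# exp(compute_log_likelihoods(...)) for that action. The numbers below are
# illustrative only, not taken from an actual policy.
import numpy as np

_sampled = [0, 1, 1, 2, 1, 0, 3, 1, 1, 2] * 50  # 500 fake sampled actions.
_empirical_prob = _sampled.count(1) / len(_sampled)
_logp_from_policy = np.log(0.5)  # Stand-in for policy.compute_log_likelihoods().
assert abs(np.exp(_logp_from_policy) - _empirical_prob) < 0.2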