class ChildrenValuePrinter(HumanPrintWrapper):
    def __init__(self, env, value_fun):
        """
        Args:
            value_fun: callable mapping (obs, states) -> value; it is called
                with the keyword argument `states`.
        """
        super().__init__(env)
        self.render_env = SokobanEnv(**env.init_kwargs)
        self.value_fun = value_fun

    def formatted_state_value(self, state):
        return "{:.2f}".format(self.value_fun(states=state)[0][0])

    def build_texts(self, obs, reward, done, info):
        child_values = list()
        state = self.env.clone_full_state()
        value_str = self.formatted_state_value(state)
        # Evaluate the value of every child state reachable in one step.
        for action in range(self.render_env.action_space.n):
            self.render_env.restore_full_state(state)
            self.render_env.step(action)
            child_state = self.render_env.clone_full_state()
            child_values.append(self.formatted_state_value(child_state))
        print('Children values: {}'.format(" ".join(child_values)))
        return [
            'Value: {}'.format(value_str),
            'Children values: {}'.format(" ".join(child_values)),
        ]
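# A minimal wiring sketch for ChildrenValuePrinter (assumptions: the
# `_example_` helper and `dummy_value` are illustrative and not part of the
# module; `dummy_value` stands in for a trained value network called with the
# `states=` keyword and returning a (1, 1) array).
def _example_children_value_printer():
    def dummy_value(states):
        return np.zeros((1, 1), dtype=np.float32)

    base_env = SokobanEnv(dim_room=(8, 8), num_boxes=1)
    printer = ChildrenValuePrinter(base_env, value_fun=dummy_value)
    obs = base_env.reset()
    # Texts describing the current state value and the values of all children.
    return printer.build_texts(obs, reward=0.0, done=False, info={})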
def test_serialization(dim=(8, 8), num_boxes=1, mode='rgb_array', seed=None,
                       curriculum=300):
    if not seed:
        _, seed = seeding.np_random(None)

    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode, curriculum=curriculum)
    env.seed(seed)
    env.reset()

    state = env.clone_full_state()
    obs = env.render(mode='rgb_array')
    value = np.float32(5.0)

    shapes = (state.shape, obs.shape, (1,))
    dtypes = (state.dtype, obs.dtype, np.float32)
    buf_size = env.max_steps * np.array([np.prod(x) for x in shapes])

    game = [(state, obs, value), (state, obs, value)]
    serial = serialize_game(game, dtypes, buf_size)
    dgame = deserialize_game(serial, buf_size, shapes, dtypes)

    # Each entry is True iff the corresponding array survived the round-trip.
    return [[(i == j).all() for i, j in zip(a, b)]
            for a, b in zip(game, dgame)]
class PolicyFromFullTree(Policy):
    def __init__(self, value_fn, env_kwargs, depth=4):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        self.value_function = value_fn
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.depth = depth
        self.nodes = dict()

    def best_actions(self, state):
        # Enumerate all action sequences of length `depth`.
        seq_ = [range(self.env.action_space.n)] * self.depth
        action_seq = list(product(*seq_))

        for actions in action_seq:
            root_action = actions[0]
            self.env.restore_full_state(state)
            branch_reward = 0
            current_depth = 0
            for action in actions:
                current_depth += 1
                ob, reward, done, _ = self.env.step(action)
                branch_reward += reward
                node = tuple(self.env.clone_full_state())
                if node not in self.nodes:
                    value = self.value_function(states=np.array(node))
                    if done:
                        value += 1000
                    self.nodes[node] = (value, branch_reward, current_depth,
                                        root_action, actions[:current_depth])
                else:
                    value, previous_reward, previous_depth, _, _ = \
                        self.nodes[node]
                    if previous_depth > current_depth:
                        # Keep the shallowest path found to this node.
                        self.nodes[node] = (value, branch_reward,
                                            current_depth, root_action,
                                            actions[:current_depth])
                if done:
                    break

        # Choose the node maximizing value estimate plus accumulated reward
        # and return the first action of the path that reaches it.
        best_node = max(
            self.nodes.keys(),
            key=(lambda node: self.nodes[node][0] + self.nodes[node][1]))
        node_value, branch_reward, current_depth, root_action, actions = \
            self.nodes[best_node]
        return [root_action]
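# A minimal usage sketch of the exhaustive-tree policy (assumptions: the
# `_example_` helper and `dummy_value` are illustrative; `dummy_value` mimics
# a value network called with the `states=` keyword, and the SokobanEnv kwargs
# below are arbitrary but must match between the policy and the acting env).
def _example_policy_from_full_tree():
    def dummy_value(states):
        return np.zeros((1, 1), dtype=np.float32)

    env_kwargs = dict(dim_room=(8, 8), num_boxes=1)
    policy = PolicyFromFullTree(dummy_value, env_kwargs, depth=2)
    env = SokobanEnv(**env_kwargs)
    env.reset()
    # The policy expands all action sequences of length `depth` from the
    # cloned state and returns the first action of the best branch.
    return policy.best_actions(env.clone_full_state())[0]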
def test_room_to_binary_map_and_back():
    env = SokobanEnv()
    for _ in range(100):
        env.reset()
        flat_state = env.clone_full_state()
        (state, structure) = render_utils.get_room_state_and_structure(
            flat_state, env.dim_room)
        room = render_utils.make_standalone_state(state, structure)
        binary_map = render_utils.room_to_binary_map(room)
        converted_room = render_utils.binary_map_to_room(binary_map)
        assert (converted_room == room).all()
def test_recover(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    if not seed:
        _, seed = seeding.np_random(None)

    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode, max_distinct_rooms=10)
    env.seed(seed)
    env.reset()

    obs = env.render()
    state = env.clone_full_state()
    print(state == env.recover_state(obs))
def generate_next_frame_and_done_data(env_kwargs, seed, n_trajectories=100,
                                      trajectory_len=40, clone_done=100):
    num_boxes_range = next_frame_and_done_data_params()["num_boxes_range"]
    if num_boxes_range is None:
        print("num_boxes_range", num_boxes_range)
        num_boxes_range = [env_kwargs["num_boxes"]]

    env_kwargs = deepcopy(env_kwargs)
    np.random.seed(seed)
    env_kwargs["num_boxes"] = num_boxes_range[
        np.random.randint(len(num_boxes_range))]
    render_env = SokobanEnv(**env_kwargs)
    render_env.seed(seed)

    # Collect random trajectories: [(observations, actions, done), ...].
    trajectories = list()
    for _ in range(n_trajectories):
        render_env.reset()
        state = render_env.clone_full_state()
        trajectories.append(
            random_trajectory(state, render_env, trajectory_len))

    # Parse trajectories into training arrays.
    data_x = list()
    data_y_next_frame = list()
    data_y_if_done = list()
    for obs, actions, done in trajectories:
        data_x.extend([
            image_with_embedded_action(ob, action, render_env.action_space.n)
            for ob, action in zip(obs[:-1], actions)
        ])
        data_y_next_frame.extend([ob for ob in obs[1:]])
        data_y_if_done.extend([False] * (len(actions) - 1) + [done])
        if done and (clone_done > 1):
            # Oversample the terminal transition to balance the `done` labels.
            data_x.extend([data_x[-1].copy() for _ in range(clone_done)])
            data_y_next_frame.extend(
                [data_y_next_frame[-1].copy() for _ in range(clone_done)])
            data_y_if_done.extend(
                [data_y_if_done[-1] for _ in range(clone_done)])

    data_x = np.array(data_x)
    data_y = {
        Target.NEXT_FRAME.value: np.array(data_y_next_frame),
        "if_done": np.array(data_y_if_done).reshape((-1, 1)).astype(int),
    }
    return data_x, data_y, {}
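# An illustrative call (assumptions: the `_example_` helper is not part of
# the module, the env kwargs below are arbitrary valid SokobanEnv arguments,
# and next_frame_and_done_data_params() is configured by the surrounding
# experiment spec).
def _example_next_frame_and_done_data():
    data_x, data_y, _ = generate_next_frame_and_done_data(
        env_kwargs=dict(dim_room=(8, 8), num_boxes=1, mode='rgb_array'),
        seed=0, n_trajectories=10, trajectory_len=20)
    # data_x: frames with the taken action embedded in the image;
    # data_y: next frames plus binary `if_done` labels.
    return data_x, data_y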
class QFromV(object):
    def __init__(self, value_function, env_kwargs, nan_for_zero_value=True,
                 copy_negative=True):
        self.value_function = value_function
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.nan_for_zero_value = nan_for_zero_value
        self.copy_negative_values = copy_negative

    @property
    def env_n_actions(self):
        return self.env.action_space.n

    def q_values(self, state):
        q_values = list()
        if self.nan_for_zero_value:
            # Sokoban success states might have no children; their value is 0.
            if self.value_function(states=state) == 0:
                return [np.nan] * self.env_n_actions
        if self.copy_negative_values:
            # Speed-up: do not expand children of negative-value states.
            val = self.value_function(states=state)[0]
            if val < 0:
                return [val] * self.env_n_actions

        # One-step lookahead: Q(s, a) = r(s, a) + V(s'), with V(s') omitted
        # for terminal children.
        for action in range(self.env_n_actions):
            self.env.restore_full_state(state)
            ob, reward, done, _ = self.env.step(action)
            value = reward
            child_state = self.env.clone_full_state()
            if not done:
                value += self.value_function(states=child_state)[0]
            q_values.append(float(value))
        return q_values
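# A minimal usage sketch (assumptions: the `_example_` helper and
# `dummy_value` are illustrative; `dummy_value` mimics a value network called
# with the `states=` keyword and returning a length-1 array).
def _example_q_from_v():
    def dummy_value(states):
        return np.array([0.5], dtype=np.float32)

    env_kwargs = dict(dim_room=(8, 8), num_boxes=1)
    q_fn = QFromV(dummy_value, env_kwargs)
    env = SokobanEnv(**env_kwargs)
    env.reset()
    return q_fn.q_values(env.clone_full_state())  # one Q estimate per action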