def _load_shard_vf(shard, data_files_prefix, env_kwargs,
                   filter_values_fn=None, transform_values_fn=None):
    data = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    vf = ValueLoader()
    for vf_for_root in data:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        vf_data = vf.dump_vf_for_root(root)
        for env_state, v in vf_data:
            # Skip states rejected by the filter.
            if filter_values_fn and filter_values_fn(v):
                continue
            if transform_values_fn:
                v = transform_values_fn(v)
            render_env.restore_full_state(env_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            data_y.append(v)
    data_y = np.asarray(data_y)
    # Scalar targets are batched to shape (num_samples, 1).
    if len(data_y.shape) == 1:
        data_y = data_y.reshape((len(data_y), 1))
    return np.asarray(data_x), data_y, {}
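
# A minimal usage sketch for _load_shard_vf. The shard index, data prefix
# and env_kwargs below are illustrative assumptions, not values taken from
# this repo; the filter drops dead ends (value -inf) and the transform caps
# values at 0, purely as an example of the two callbacks.
def _example_load_shard_vf():
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='rgb_array',
                      max_steps=100)
    data_x, data_y, _ = _load_shard_vf(
        shard=0,
        data_files_prefix='/tmp/sokoban_vf/shard',  # hypothetical path
        env_kwargs=env_kwargs,
        filter_values_fn=lambda v: v == -float("inf"),
        transform_values_fn=lambda v: min(v, 0),
    )
    # Inputs and targets are aligned per state.
    assert data_x.shape[0] == data_y.shape[0]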
def test_serialization(dim=(8, 8), num_boxes=1, mode='rgb_array', seed=None,
                       curriculum=300):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode, curriculum=curriculum)
    env.seed(seed)
    env.reset()
    state = env.clone_full_state()
    obs = env.render(mode='rgb_array')
    value = np.float32(5.0)
    shapes = (state.shape, obs.shape, (1,))
    dtypes = (state.dtype, obs.dtype, np.float32)
    buf_size = env.max_steps * np.array([np.prod(x) for x in shapes])
    game = [(state, obs, value), (state, obs, value)]
    serial = serialize_game(game, dtypes, buf_size)
    dgame = deserialize_game(serial, buf_size, shapes, dtypes)
    # Each entry should round-trip exactly.
    return [[(i == j).all() for i, j in zip(a, b)]
            for a, b in zip(game, dgame)]
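
# A small check built on test_serialization: every comparison in the
# returned nested list should be True when serialization round-trips
# losslessly. The fixed seed is an arbitrary choice for reproducibility.
def _example_check_serialization():
    results = test_serialization(dim=(8, 8), num_boxes=1, seed=123)
    assert all(all(entry) for entry in results)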
def test_img():
    from PIL import Image
    env = SokobanEnv(dim_room=(10, 10), max_steps=100, num_boxes=4,
                     mode='rgb_array', max_distinct_rooms=10)
    for i in range(10):
        env.reset()
        img = env.render()
        Image.fromarray(img, "RGB").save("{}.png".format(i))
def test_recover(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode, max_distinct_rooms=10)
    env.seed(seed)
    env.reset()
    obs = env.render()
    state = env.clone_full_state()
    # recover_state should reconstruct the full state from the observation.
    print(state == env.recover_state(obs))
def test_seed(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    from ctypes import c_uint
    from PIL import Image
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode)
    env.seed(seed)
    # Print the seed as an unsigned 32-bit integer.
    print("Seed: {}".format(np.uint32(c_uint(seed).value)))
    env.reset()
    img = env.render()
    Image.fromarray(img, "RGB").resize((200, 200)).show()
class ValueFromKerasNet(Value, ABC):
    def __init__(self, model, env_kwargs):
        # `model` may be a path to a saved Keras model or a model instance.
        if isinstance(model, str):
            self.model = load_model(model)
        else:
            self.model = model
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()

    def _network_prediction(self, state):
        # Render the state to an observation and run the network on it.
        self.env.restore_full_state(state)
        obs = self.env.render()
        return self.model.predict(np.expand_dims(obs, axis=0))

    def __call__(self, state):
        raise NotImplementedError
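
# A minimal sketch of a concrete subclass, assuming the network's single
# output is a scalar state value; the class name is hypothetical and only
# shows how __call__ is meant to be filled in.
class _ExampleScalarValue(ValueFromKerasNet):
    def __call__(self, state):
        # predict() returns a batch of size 1; unwrap it to a float.
        return float(self._network_prediction(state)[0, 0])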
class PolicyFromNet(Policy):
    def __init__(self, model, env_kwargs):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        # `model` may be a path to a saved Keras model or a model instance.
        if isinstance(model, str):
            self.model = load_model(model)
        else:
            self.model = model
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        assert len(self.model.outputs) == 1

    def best_actions(self, state):
        self.env.restore_full_state(state)
        ob = self.env.render()
        policy = self.model.predict(np.expand_dims(ob, axis=0))[0]
        # Single-output network: return the highest-probability action.
        return [np.argmax(policy)]
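
# Usage sketch for PolicyFromNet, assuming a Keras model saved at a
# hypothetical path whose single output is a distribution over actions,
# and a `state` cloned from an env built with matching env_kwargs.
def _example_policy_from_net(state):
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='rgb_array',
                      max_steps=100)
    policy = PolicyFromNet('/tmp/policy_net.h5', env_kwargs)
    return policy.best_actions(state)[0]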
def _load_shard_best_action_ignore_finall(shard, data_files_prefix, env_kwargs):
    """Choose the best action for each state.

    If all actions are equally good, give a special target value (equal to
    env.action_space.n). For Sokoban this separates dead ends, for which
    there is no good action.
    """
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    data_value = []
    vf = ValueLoader()
    policy = PolicyFromValue(vf, env_kwargs)
    assert policy.env_n_actions == render_env.action_space.n
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        data = vf.dump_vf_for_root(root)
        for node_state, v in data:
            if v in [0, -float("inf")]:
                # TODO(kc): ValuePerfect does not produce some states which
                # can be obtained after solving the game. How to clean it up?
                continue
            render_env.restore_full_state(node_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            best_actions = policy.act(node_state, return_single_action=False)
            # Break ties deterministically: take the lowest action index.
            y = np.min(best_actions)
            one_hot_y = np.zeros(shape=render_env.action_space.n, dtype=int)
            one_hot_y[y] = 1
            data_y.append(one_hot_y)
            data_value.append(v)
    return np.asarray(data_x), np.asarray(data_y), \
        dict(value=np.asarray(data_value))
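
# Sketch of inspecting one training example produced by the loader above;
# the shard, prefix and env_kwargs arguments are illustrative assumptions.
def _example_inspect_best_action_data(shard, data_files_prefix, env_kwargs):
    data_x, data_y, extra = _load_shard_best_action_ignore_finall(
        shard, data_files_prefix, env_kwargs)
    # Each target row is one-hot over the action space; argmax recovers the
    # chosen (lowest-index) best action.
    first_action = int(np.argmax(data_y[0]))
    return data_x[0], first_action, extra["value"][0]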
def process_board_data(compressed_data, target, env_kwargs, sample_data,
                       max_sample_size, random_state):
    """
    Args:
        compressed_data: dictionary with keys ["full_env_state",
            "perfect_value", "perfect_q"], mapping to compressed arrays.
    """
    render_env = SokobanEnv(**env_kwargs)
    keys = compressed_data.keys()
    assert_v2_keys(compressed_data)
    data = {key: decompress_np_array(compressed_data[key]) for key in keys}
    assert_env_and_state_match(env_kwargs, data["full_env_state"][0])

    stratified_sample_fn = lambda values, q: stratified_sample(
        values, q, max_sample_size, random_state)
    simple_sample_fn = lambda values, q: simple_sample(
        values, q, max_sample_size, random_state)

    # Targets trained only on solvable states filter out the rest and use
    # simple sampling; all remaining targets keep every state and use
    # stratified sampling.
    solvable_only_targets = {
        Target.VF_SOLVABLE_ONLY, Target.BEST_ACTION,
        Target.BEST_ACTION_FRAMESTACK,
    }
    stratified_targets = {
        Target.VF, Target.STATE_TYPE, Target.VF_AND_TYPE, Target.NEXT_FRAME,
        Target.DELTA_VALUE, Target.VF_DISCOUNTED, Target.NEXT_FRAME_AND_DONE,
    }
    if target in solvable_only_targets:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target in stratified_targets:
        filter_values_fn = lambda v, q: False
        sample_fn = stratified_sample_fn
    else:
        raise ValueError("Unknown target {}".format(target))

    mask = ~np.array([
        filter_values_fn(v, q)
        for v, q in zip(data['perfect_value'], data['perfect_q'])
    ], dtype=bool)
    data = {key: data[key][mask] for key in keys}

    if sample_data:
        sample_ix = sample_fn(data["perfect_value"], data["perfect_q"])
    else:
        raise NotImplementedError()

    if target == Target.DELTA_VALUE:
        data_x, data_y = extract_delta_value(data, sample_ix, render_env,
                                             random_state)
    elif target == Target.VF_DISCOUNTED:
        data_x, data_y = extract_discounted_value(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    elif target == Target.BEST_ACTION_FRAMESTACK:
        data_x, data_y = extract_best_action_from_framestack(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    else:
        data = {key: data[key][sample_ix] for key in keys}
        if target == Target.NEXT_FRAME:
            data_x, data_y = extract_next_frame_input_and_target(
                data["full_env_state"], render_env)
        else:
            obs = list()
            for node_state in data['full_env_state']:
                render_env.restore_full_state(node_state)
                ob = render_env.render(mode=render_env.mode)
                obs.append(ob)
            data_x = np.array(obs)
            data_y = extract_target_from_value(
                perfect_v=data["perfect_value"],
                perfect_q=data["perfect_q"],
                target=target)

    if isinstance(data_y, np.ndarray):
        assert len(data_y.shape) > 1, \
            "data_y should be batched (if target is scalar it should have " \
            "shape (num_samples, 1))"
    return data_x, data_y, {}
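
# Usage sketch for process_board_data. The compressed-array dict would come
# from the data pipeline, so everything here is illustrative: the target
# choice is arbitrary and the RandomState type for `random_state` is an
# assumption consistent with how it is threaded into the samplers above.
def _example_process_board_data(compressed_data, env_kwargs):
    rng = np.random.RandomState(0)
    data_x, data_y, _ = process_board_data(
        compressed_data,
        target=Target.VF_SOLVABLE_ONLY,
        env_kwargs=env_kwargs,
        sample_data=True,  # sample_data=False raises NotImplementedError
        max_sample_size=1000,
        random_state=rng,
    )
    # Targets are always batched; scalar targets have shape (n, 1).
    assert data_y.shape[0] == data_x.shape[0]
    return data_x, data_y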