def __init__(self, model, env_kwargs):
    if isinstance(model, str):
        self.model = load_model(model)
    else:
        self.model = model
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
def _load_shard_vf(shard, data_files_prefix, env_kwargs,
                   filter_values_fn=None, transform_values_fn=None):
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    vf = ValueLoader()
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        root_data = vf.dump_vf_for_root(root)
        for env_state, v in root_data:
            if filter_values_fn and filter_values_fn(v):
                continue
            if transform_values_fn:
                v = transform_values_fn(v)
            render_env.restore_full_state(env_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            data_y.append(v)
    data_y = np.asarray(data_y)
    if len(data_y.shape) == 1:
        data_y = data_y.reshape((len(data_y), 1))
    return np.asarray(data_x), data_y, {}
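# Illustrative call of _load_shard_vf showing the filter/transform hooks. This
# is only a sketch: the shard index, data_files_prefix and env_kwargs are
# hypothetical, and the lambdas are example choices, not the repo's defaults.
if __name__ == "__main__":
    data_x, data_y, _ = _load_shard_vf(
        shard=0,
        data_files_prefix="./value_data/shard",
        env_kwargs=dict(dim_room=(8, 8), num_boxes=1, mode="one_hot"),
        # Skip dead ends (perfect value of -inf).
        filter_values_fn=lambda v: v == -float("inf"),
        # Clip the remaining values to a fixed range before training.
        transform_values_fn=lambda v: max(v, -20.0),
    )
    print(data_x.shape, data_y.shape)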
class ChildrenValuePrinter(HumanPrintWrapper):
    def __init__(self, env, value_fun):
        """
        Args:
            value_fun: callable (obs, states) -> value; it is called with the
                keyword argument `states`.
        """
        super().__init__(env)
        self.render_env = SokobanEnv(**env.init_kwargs)
        self.value_fun = value_fun

    def formatted_state_value(self, state):
        return "{:.2f}".format(self.value_fun(states=state)[0][0])

    def build_texts(self, obs, reward, done, info):
        child_values = list()
        state = self.env.clone_full_state()
        value_str = self.formatted_state_value(state)
        for action in range(self.render_env.action_space.n):
            self.render_env.restore_full_state(state)
            self.render_env.step(action)
            child_state = self.render_env.clone_full_state()
            child_value_str = self.formatted_state_value(child_state)
            child_values.append(child_value_str)
        print('Children values: {}'.format(" ".join(child_values)))
        return [
            'Value: {}'.format(value_str),
            'Children values: {}'.format(" ".join(child_values))
        ]
def create_env(seed, dim_room=(13, 13), num_boxes=5):
    env = SokobanEnv(dim_room=dim_room,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode='rgb_array',
                     max_distinct_rooms=10)
    env.seed(seed)
    return env
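# Minimal usage sketch for create_env: a short random rollout on a seeded
# environment. Only the standard gym-style reset/step API used throughout
# this repo is assumed.
if __name__ == "__main__":
    env = create_env(seed=123)
    env.reset()
    for _ in range(5):
        ob, reward, done, info = env.step(env.action_space.sample())
        if done:
            break
    print("last reward:", reward)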
def __init__(self, value_fn, env_kwargs, depth=4):
    self.render_env = SokobanEnv(**env_kwargs)
    self.env_n_actions = self.render_env.action_space.n
    self.value_function = value_fn
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
    self.depth = depth
    self.nodes = dict()
def __init__(self, env, value_fun):
    """
    Args:
        value_fun: callable (obs, states) -> value; it is called with the
            keyword argument `states`.
    """
    super().__init__(env)
    self.render_env = SokobanEnv(**env.init_kwargs)
    self.value_fun = value_fun
def __init__(self, value_function, env_kwargs, nan_for_zero_value=True,
             copy_negative=True):
    self.value_function = value_function
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
    self.nan_for_zero_value = nan_for_zero_value
    self.copy_negative_values = copy_negative
def __init__(self, model, env_kwargs):
    self.render_env = SokobanEnv(**env_kwargs)
    self.env_n_actions = self.render_env.action_space.n
    if isinstance(model, str):
        self.model = load_model(model)
    else:
        self.model = model
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
    assert len(self.model.outputs) == 1
def test_img():
    env = SokobanEnv(dim_room=(10, 10),
                     max_steps=100,
                     num_boxes=4,
                     mode='rgb_array',
                     max_distinct_rooms=10)
    from PIL import Image
    for i in range(10):
        env.reset()
        img = env.render()
        Image.fromarray(img, "RGB").save("{}.png".format(i))
def test_one_hot_mode():
    dim_room = (10, 10)
    env = SokobanEnv(dim_room=dim_room,
                     max_steps=100,
                     num_boxes=2,
                     mode='one_hot',
                     max_distinct_rooms=10)
    obs = env.reset()
    assert obs.shape == dim_room + (7, )
    assert obs.dtype == np.uint8
    print(obs.shape)
def test_room_to_binary_map_and_back():
    env = SokobanEnv()
    for _ in range(100):
        env.reset()
        flat_state = env.clone_full_state()
        (state, structure) = render_utils.get_room_state_and_structure(
            flat_state, env.dim_room)
        room = render_utils.make_standalone_state(state, structure)
        binary_map = render_utils.room_to_binary_map(room)
        converted_room = render_utils.binary_map_to_room(binary_map)
        assert (converted_room == room).all()
def test_serialization(dim=(8, 8), num_boxes=1, mode='rgb_array', seed=None,
                       curriculum=300):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode=mode,
                     curriculum=curriculum)
    env.seed(seed)
    env.reset()
    state = env.clone_full_state()
    obs = env.render(mode='rgb_array')
    value = np.float32(5.0)
    shapes = (state.shape, obs.shape, (1, ))
    dtypes = (state.dtype, obs.dtype, np.float32)
    buf_size = env.max_steps * np.array([np.prod(x) for x in shapes])
    game = [(state, obs, value), (state, obs, value)]
    serial = serialize_game(game, dtypes, buf_size)
    zz = np.frombuffer(serial, dtype=np.uint8)
    dgame = deserialize_game(serial, buf_size, shapes, dtypes)
    return [[(i == j).all() for i, j in zip(a, b)]
            for a, b in zip(game, dgame)]
def test_type_counts(dim_room=(13, 13), num_boxes=4):
    env = SokobanEnv(dim_room=dim_room,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode='one_hot')
    ob = env.reset()
    type_counter = collections.Counter(
        np.reshape(np.argmax(ob, axis=2), newshape=(-1, )))

    def assert_type_count(type_set, number):
        assert sum(type_counter[type] for type in type_set) == number

    assert_type_count(OneHotTypeSets.player, 1)
    assert_type_count(OneHotTypeSets.box, num_boxes)
    assert_type_count(OneHotTypeSets.target, num_boxes)
class ValueFromKerasNet(Value, ABC):
    def __init__(self, model, env_kwargs):
        if isinstance(model, str):
            self.model = load_model(model)
        else:
            self.model = model
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()

    def _network_prediction(self, state):
        self.env.restore_full_state(state)
        obs = self.env.render()
        return self.model.predict(np.expand_dims(obs, axis=0))

    def __call__(self, state):
        raise NotImplementedError
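# A minimal concrete subclass sketch of ValueFromKerasNet, assuming the
# wrapped Keras model has a single scalar output per observation. The class
# name and the unwrapping of the (1, 1) prediction are illustrative choices,
# not part of the repo's API.
class ScalarValueFromKerasNet(ValueFromKerasNet):
    def __call__(self, state):
        # For a single scalar-output network, _network_prediction returns an
        # array of shape (1, 1); unwrap it to a plain float.
        return float(self._network_prediction(state)[0][0])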
def test_recover(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode=mode,
                     max_distinct_rooms=10)
    env.seed(seed)
    env.reset()
    obs = env.render()
    state = env.clone_full_state()
    print(state == env.recover_state(obs))
class PolicyFromNet(Policy):
    def __init__(self, model, env_kwargs):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        if isinstance(model, str):
            self.model = load_model(model)
        else:
            self.model = model
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        assert len(self.model.outputs) == 1

    def best_actions(self, state):
        self.env.restore_full_state(state)
        ob = self.env.render()
        policy = self.model.predict(np.expand_dims(ob, axis=0))[0]
        best_actions = [np.argmax(policy)]
        return best_actions
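# Illustrative use of PolicyFromNet: load a trained policy network and act
# greedily in a fresh environment. The checkpoint path and env_kwargs are
# hypothetical; the model is assumed to accept the env's rendered
# observations.
if __name__ == "__main__":
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='rgb_array')
    policy = PolicyFromNet("checkpoints/epoch.0100.hdf5", env_kwargs)
    env = SokobanEnv(**env_kwargs)
    env.reset()
    for _ in range(20):
        action = policy.best_actions(env.clone_full_state())[0]
        _, _, done, _ = env.step(action)
        if done:
            break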
def test_playing():
    env = PlayWrapper(
        InfoDisplayWrapper(
            RewardPrinter(
                SokobanEnv(num_boxes=1,
                           game_mode="Magnetic",
                           penalty_pull_action=-0.3)),
            augment_observations=True,
            min_text_area_width=500))
    env.play()
def test_rendering():
    env = InfoDisplayWrapper(RewardPrinter(SokobanEnv()),
                             augment_observations=True,
                             min_text_area_width=500)
    env.reset()
    env.step(0)
    obs = env.render()
    assert obs.shape == (80, 580, 3)
    env.render(mode='human')
    from time import sleep
    sleep(2)
def render_state(state, tiny=False):
    # To avoid circular import.
    from gym_sokoban.envs import SokobanEnv

    # Cache the surfaces to avoid reloading.
    if SURFACES is None:
        globals()['SURFACES'] = SokobanEnv.load_surfaces()

    if tiny:
        render_fn = room_to_tiny_world_rgb
        surface_name = 'tiny_rgb_array'
    else:
        render_fn = room_to_rgb
        surface_name = 'rgb_array'
    return render_fn(state, surfaces=SURFACES[surface_name])
def _load_shard_best_action_ignore_finall(shard, data_files_prefix,
                                          env_kwargs):
    """Choose the best action for each state.

    States with perfect value 0 or -inf are skipped; for Sokoban this drops
    already-solved states and dead ends, for which there is no good action.
    """
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    data_value = []
    vf = ValueLoader()
    policy = PolicyFromValue(vf, env_kwargs)
    assert policy.env_n_actions == render_env.action_space.n
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        data = vf.dump_vf_for_root(root)
        for node_state, v in data:
            if v in [0, -float("inf")]:
                # TODO(kc): ValuePerfect does not produce some states which
                # can be obtained after solving the game. How to clean it up?
                continue
            render_env.restore_full_state(node_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            best_actions = policy.act(node_state, return_single_action=False)
            y = np.min(best_actions)
            one_hot_y = np.zeros(shape=render_env.action_space.n, dtype=int)
            one_hot_y[y] = 1
            data_y.append(one_hot_y)
            data_value.append(v)
    return np.asarray(data_x), np.asarray(data_y), \
        dict(value=np.asarray(data_value))
def generate_next_frame_and_done_data(env_kwargs, seed, n_trajectories=100,
                                      trajectory_len=40, clone_done=100):
    num_boxes_range = next_frame_and_done_data_params()["num_boxes_range"]
    if num_boxes_range is None:
        print("num_boxes_range", num_boxes_range)
        num_boxes_range = [env_kwargs["num_boxes"]]
    env_kwargs = deepcopy(env_kwargs)
    np.random.seed(seed)
    env_kwargs["num_boxes"] = num_boxes_range[np.random.randint(
        len(num_boxes_range))]
    render_env = SokobanEnv(**env_kwargs)
    render_env.seed(seed)

    trajectories = list()  # [(observations, actions, done), ...]
    for i in range(n_trajectories):
        render_env.reset()
        state = render_env.clone_full_state()
        # Generate a random path.
        trajectories.append(
            random_trajectory(state, render_env, trajectory_len))

    # Parse trajectories into arrays.
    data_x = list()
    data_y_next_frame = list()
    data_y_if_done = list()
    for obs, actions, done in trajectories:
        data_x.extend([
            image_with_embedded_action(ob, action, render_env.action_space.n)
            for ob, action in zip(obs[:-1], actions)
        ])
        data_y_next_frame.extend([ob for ob in obs[1:]])
        data_y_if_done.extend([False] * (len(actions) - 1) + [done])
        if done and (clone_done > 1):
            data_x.extend([data_x[-1].copy() for _ in range(clone_done)])
            data_y_next_frame.extend(
                [data_y_next_frame[-1].copy() for _ in range(clone_done)])
            data_y_if_done.extend(
                [data_y_if_done[-1] for _ in range(clone_done)])

    data_x = np.array(data_x)
    data_y = {
        Target.NEXT_FRAME.value: np.array(data_y_next_frame),
        "if_done": np.array(data_y_if_done).reshape((-1, 1)).astype(int),
    }
    return data_x, data_y, {}
def test_seed(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    from ctypes import c_uint
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode='rgb_array')
    env.seed(seed)
    print("Seed: {}".format(np.uint32(c_uint(seed))))
    from PIL import Image
    env.reset()
    img = env.render()
    Image.fromarray(img, "RGB").resize((200, 200)).show()
class PolicyFromFullTree(Policy):
    def __init__(self, value_fn, env_kwargs, depth=4):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        self.value_function = value_fn
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.depth = depth
        self.nodes = dict()

    def best_actions(self, state):
        # Produce all action sequences of length `depth`.
        seq_ = [range(self.env.action_space.n)] * self.depth
        action_seq = list(product(*seq_))
        for actions in action_seq:
            root_action = actions[0]
            self.env.restore_full_state(state)
            branch_reward = 0
            current_depth = 0
            for action in actions:
                current_depth += 1
                ob, reward, done, _ = self.env.step(action)
                branch_reward += reward
                node = tuple(self.env.clone_full_state())
                if node not in self.nodes:
                    value = self.value_function(states=np.array(node))
                    if done:
                        value += 1000
                    self.nodes[node] = (value, branch_reward, current_depth,
                                        root_action, actions[:current_depth])
                else:
                    value, previous_reward, previous_depth, _, _ = \
                        self.nodes[node]
                    if previous_depth > current_depth:
                        # Keep the shallowest path found to this node.
                        self.nodes[node] = (value, branch_reward,
                                            current_depth, root_action,
                                            actions[:current_depth])
                if done:
                    break

        best_node = max(
            self.nodes.keys(),
            key=(lambda node: self.nodes[node][0] + self.nodes[node][1]))
        node_value, branch_reward, current_depth, root_action, actions = \
            self.nodes[best_node]
        return [root_action]
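# Illustrative use of PolicyFromFullTree with a trivial constant value
# function; a real run would plug in e.g. a value network wrapped as a
# callable accepting the `states` keyword. The env_kwargs are hypothetical.
if __name__ == "__main__":
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='rgb_array')
    constant_value_fn = lambda states: 0.0  # placeholder value estimate
    policy = PolicyFromFullTree(constant_value_fn, env_kwargs, depth=3)
    env = SokobanEnv(**env_kwargs)
    env.reset()
    print("chosen action:", policy.best_actions(env.clone_full_state())[0])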
class QFromV(object):
    def __init__(self, value_function, env_kwargs, nan_for_zero_value=True,
                 copy_negative=True):
        self.value_function = value_function
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.nan_for_zero_value = nan_for_zero_value
        self.copy_negative_values = copy_negative

    @property
    def env_n_actions(self):
        return self.env.action_space.n

    def q_values(self, state):
        q_values = list()
        if self.nan_for_zero_value:
            # Value might not have children for Sokoban success states.
            if self.value_function(states=state) == 0:
                return [np.nan] * self.env_n_actions
        if self.copy_negative_values:
            # For speed-up.
            val = self.value_function(states=state)[0]
            if val < 0:
                return [val] * self.env_n_actions
        for action in range(self.env_n_actions):
            self.env.restore_full_state(state)
            ob, reward, done, _ = self.env.step(action)
            value = reward
            child_state = self.env.clone_full_state()
            if not done:
                value += self.value_function(states=child_state)[0]
            q_values.append(float(value))
        return q_values
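# Illustrative use of QFromV: derive a greedy action from the one-step
# lookahead Q-values. The constant value function and env_kwargs below are
# placeholders for a trained model.
if __name__ == "__main__":
    import numpy as np

    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='rgb_array')
    value_fn = lambda states: np.array([0.0])  # placeholder value estimate
    q_from_v = QFromV(value_fn, env_kwargs, nan_for_zero_value=False)
    env = SokobanEnv(**env_kwargs)
    env.reset()
    q_values = q_from_v.q_values(env.clone_full_state())
    print("greedy action:", int(np.nanargmax(q_values)))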
def __init__(self, *, data_files_prefix, env, net, epochs, batch_size, lr,
             lr_decay=0.0, shards_to_use=None, validation_shards=1,
             save_every=None, output_dir, histogram_freq=None,
             validate_every_batch=5000, neptune_first_batch=10000,
             target="vf", loss=None, n_cores=None, sample_data=False,
             max_samples_per_board=1000, eval_games_to_play=10, **kwargs):
    if shards_to_use is None:
        self.number_of_shards = infer_number_of_shards(data_files_prefix)
    else:
        self.number_of_shards = shards_to_use
    self.validation_shards = validation_shards
    assert self.validation_shards < self.number_of_shards
    if self.number_of_shards == 1:
        print("WARNING: there is only one shard, so it is used for both "
              "training and validation.")
        self.training_shards = [0]
        self.validation_shards = [0]
    else:
        self.training_shards = list(
            range(self.number_of_shards - self.validation_shards))
        self.validation_shards = list(
            range(self.number_of_shards - self.validation_shards,
                  self.number_of_shards))
    self.data_files_prefix = data_files_prefix
    self.save_every = save_every
    self.checkpoint_dir = os.path.join(output_dir, "checkpoints",
                                       "epoch.{epoch:04d}.hdf5")
    os.makedirs(os.path.dirname(self.checkpoint_dir), exist_ok=True)
    self.exp_dir_path = output_dir
    self.histogram_freq = histogram_freq
    self.epochs = epochs
    self.env_kwargs = env
    self.render_env = SokobanEnv(**self.env_kwargs)
    self.render_mode = self.render_env.mode

    self.target = Target(target)
    del target
    print("self.target", self.target)
    self.loss = loss_for_target(self.target, loss)
    final_activation = final_network_activation(self.target)
    net_output_size = net_output_size_for_target(
        self.target, self.render_env.action_space.n,
        n_channels_from_mode(env.get("mode", "one_hot")))

    input_channels = n_channels_from_mode(env.get("mode", "one_hot"))
    if self.target in [Target.NEXT_FRAME, Target.NEXT_FRAME_AND_DONE]:
        input_channels += SokobanEnv(**env).action_space.n
    if self.target in [Target.DELTA_VALUE, Target.BEST_ACTION_FRAMESTACK]:
        input_channels *= 2

    self.metrics = [self.loss]
    if isinstance(self.loss, dict):
        # [0] is a dirty change of metrics for vf_and_type
        self.metrics = self.metrics[0]
    self.network = get_network(
        input_shape=tuple(list(env["dim_room"]) + [input_channels]),
        output_size=net_output_size,
        final_activation=final_activation,
        **net)
    self.network.compile(optimizer="adam", loss=self.loss,
                         metrics=self.metrics)

    self.learning_rate_lambda = lambda epoch: lr / (1 + lr_decay * epoch)
    self.batch_size = batch_size
    self.validate_every_batch = validate_every_batch
    self.neptune_first_batch = neptune_first_batch
    if n_cores is None:
        n_cores = count_cpu()
    self.n_cores = n_cores
    self.sample_data = sample_data
    self.max_samples_per_board = max_samples_per_board
    self.random_state = np.random.RandomState(0)
    self.eval_games_to_play = eval_games_to_play
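# Sketch (not the trainer's actual training loop) of how the
# learning_rate_lambda defined above could be wired into Keras:
# LearningRateScheduler calls the schedule with the epoch index and applies
# the returned learning rate. The `trainer` argument name is hypothetical.
from keras.callbacks import LearningRateScheduler

def make_lr_callback(trainer):
    # Decays the learning rate as lr / (1 + lr_decay * epoch).
    return LearningRateScheduler(trainer.learning_rate_lambda)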
def process_board_data(compressed_data, target, env_kwargs, sample_data,
                       max_sample_size, random_state):
    """
    Args:
        compressed_data: dictionary with keys containing
            ["full_env_state", "perfect_value", "perfect_q"], mapping to
            compressed arrays.
    """
    render_env = SokobanEnv(**env_kwargs)
    keys = compressed_data.keys()
    assert_v2_keys(compressed_data)
    data = {key: decompress_np_array(compressed_data[key]) for key in keys}
    assert_env_and_state_match(env_kwargs, data["full_env_state"][0])

    filter_values_fn = lambda v, q: False
    stratified_sample_fn = lambda values, q: stratified_sample(
        values, q, max_sample_size, random_state)
    simple_sample_fn = lambda values, q: simple_sample(
        values, q, max_sample_size, random_state)

    if target == Target.VF:
        sample_fn = stratified_sample_fn
    elif target == Target.VF_SOLVABLE_ONLY:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.STATE_TYPE:
        sample_fn = stratified_sample_fn
    elif target == Target.BEST_ACTION:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.VF_AND_TYPE:
        sample_fn = stratified_sample_fn
    elif target == Target.NEXT_FRAME:
        sample_fn = stratified_sample_fn
    elif target == Target.DELTA_VALUE:
        sample_fn = stratified_sample_fn
    elif target == Target.VF_DISCOUNTED:
        sample_fn = stratified_sample_fn
    elif target == Target.BEST_ACTION_FRAMESTACK:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.NEXT_FRAME_AND_DONE:
        sample_fn = stratified_sample_fn
    else:
        raise ValueError("Unknown target {}".format(target))

    mask = ~np.array([
        filter_values_fn(v, q)
        for v, q in zip(data['perfect_value'], data['perfect_q'])
    ], dtype=bool)
    data = {key: data[key][mask] for key in keys}

    if sample_data:
        sample_ix = sample_fn(data["perfect_value"], data["perfect_q"])
    else:
        raise NotImplementedError()

    if target == Target.DELTA_VALUE:
        data_x, data_y = extract_delta_value(data, sample_ix, render_env,
                                             random_state)
    elif target == Target.VF_DISCOUNTED:
        data_x, data_y = extract_discounted_value(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    elif target == Target.BEST_ACTION_FRAMESTACK:
        data_x, data_y = extract_best_action_from_framestack(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    else:
        data = {key: data[key][sample_ix] for key in keys}
        if target == Target.NEXT_FRAME:
            data_x, data_y = extract_next_frame_input_and_target(
                data["full_env_state"], render_env)
        else:
            obs = list()
            for node_state in data['full_env_state']:
                render_env.restore_full_state(node_state)
                ob = render_env.render(mode=render_env.mode)
                obs.append(ob)
            data_x = np.array(obs)
            data_y = extract_target_from_value(
                perfect_v=data["perfect_value"],
                perfect_q=data["perfect_q"],
                target=target)

    if isinstance(data_y, np.ndarray):
        assert len(data_y.shape) > 1, \
            "data_y should be batched (if target is scalar it should have " \
            "shape (num_samples, 1))"
    return data_x, data_y, {}
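# Purely illustrative sketch of a stratified sampler over perfect values, to
# show the intent behind stratified_sample used above; this is an assumption
# about its behavior, not the repo's implementation. It draws up to
# max_sample_size indices, balanced across distinct value levels.
import numpy as np

def stratified_sample_sketch(values, q, max_sample_size, random_state):
    values = np.asarray(values).reshape(-1)
    levels = np.unique(values)
    per_level = max(1, max_sample_size // max(1, len(levels)))
    chosen = []
    for level in levels:
        ix = np.flatnonzero(values == level)
        take = min(per_level, len(ix))
        chosen.extend(random_state.choice(ix, size=take, replace=False))
    return np.sort(np.array(chosen, dtype=int))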