class Model:
    def __init__(self, sess, env, aux_task, feat_dim, lr):
        self.sess = sess or tf.Session()
        self.dynamics = Dynamics(sess=self.sess, env=env, auxiliary_task=aux_task,
                                 feat_dim=feat_dim, queue_size=1000, normalize_novelty=True)
        self.obs_shape = env.observation_space.shape
        self.ac_shape = env.action_space.shape
        del env
        self.opt = tf.train.RMSPropOptimizer(lr, decay=0.99)
        self.aux_loss = self.dynamics.aux_loss
        self.dyna_loss = self.dynamics.dyna_loss
        self.loss = self.aux_loss + self.dyna_loss
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        gradsandvars = self.opt.compute_gradients(self.loss, params)
        self.train_op = self.opt.apply_gradients(gradsandvars)
        self.train_history = []

    def train(self, data, rollout_size, online=True, save_path=None):
        """
        :param data: list of dict, one per episode:
            [{"obs": arr, "act": arr, "next_obs": arr, "x_pos": arr}]
        """
        self.recoder = DataRecorder(os.path.join(save_path, "training"))
        for episode, episode_data in enumerate(data):
            episode_length = len(episode_data["obs"])
            obs, act, next_obs, x_pos = episode_data["obs"], episode_data["act"], \
                episode_data["next_obs"], episode_data["x_pos"]
            episode_novelty = []
            if not online:
                ind = np.random.permutation(episode_length)
                obs, act, next_obs, x_pos = obs[ind], act[ind], next_obs[ind], x_pos[ind]
            for start in range(0, episode_length, rollout_size):
                end = start + rollout_size
                batch_obs, batch_act, batch_next_obs, batch_x_pos = \
                    obs[start:end], act[start:end], next_obs[start:end], x_pos[start:end]
                novelty = self.sess.run(self.dynamics.novelty,
                                        feed_dict={self.dynamics.obs: obs,
                                                   self.dynamics.ac: act,
                                                   self.dynamics.next_obs: next_obs})
                self.sess.run(self.train_op,
                              feed_dict={self.dynamics.obs: batch_obs,
                                         self.dynamics.ac: batch_act,
                                         self.dynamics.next_obs: batch_next_obs})
                p = pearsonr(x_pos, novelty)[0]
                logger.info("Episode:{}|Epoch:{}|P:{}".format(episode, start // rollout_size, p))
                episode_novelty.append(novelty)
                self.recoder.store({"x_pos": x_pos, "novelty": novelty, "episode": episode,
                                    "epoch": start // rollout_size, "p": p})
                plt.figure()
                plt.scatter(x_pos, novelty)
                # plt.yscale("log")
                plt.savefig(os.path.join(save_path, "{}_{}.png".format(episode, start // rollout_size)))
                plt.close()
        self.recoder.dump()
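
# A self-contained sketch of the minibatching scheme Model.train uses: each recorded
# episode is walked in windows of `rollout_size`, novelty is evaluated over the whole
# episode, and the Pearson correlation between x_pos and novelty is logged as a
# progress signal. The arrays below are synthetic stand-ins for a recorded episode.
import numpy as np
from scipy.stats import pearsonr

episode_length, rollout_size = 100, 32
x_pos = np.arange(episode_length, dtype=np.float32)
novelty = x_pos + np.random.randn(episode_length)   # stand-in for sess.run(dynamics.novelty, ...)
for start in range(0, episode_length, rollout_size):
    end = min(start + rollout_size, episode_length)
    batch = slice(start, end)                        # indices that would feed the training op
    p = pearsonr(x_pos, novelty)[0]
    print("Epoch:{}|Batch:[{}, {})|P:{:.3f}".format(start // rollout_size, start, end, p))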
def __init__(self, env, model, nsteps, save_interval):
    super().__init__(env=env, model=model, nsteps=nsteps)
    assert isinstance(env.action_space, spaces.Discrete), \
        'This ACER implementation works only with discrete action spaces!'
    self.nact = env.action_space.n
    nenv = self.nenv
    self.nbatch = nenv * nsteps
    self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape
    # self.obs = env.reset()
    self.obs_dtype = env.observation_space.dtype
    self.obs_shape = env.observation_space.shape
    self.ac_dtype = env.action_space.dtype
    self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
    self.save_interval = save_interval
    self.size = [int(x) for x in self.env.spec.id.split("-")[2].split("x")]
    self.desired_pos = np.asarray(self.size) - 1
    logger.info("-" * 50)
    logger.info("-" * 15, "desired_pos:", self.desired_pos, "-" * 15)
    logger.info("-" * 50)
    self.goals, self.goal_infos = self.get_goal(self.nenv)
    self.episode_step = np.zeros(self.nenv, dtype=np.int32)
    self.episode = np.zeros(self.nenv, dtype=np.int32)
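
# A minimal sketch of the env-id parsing used above: the maze size is read from the
# third dash-separated token of env.spec.id. The id string here is a hypothetical
# example; the actual id format comes from the maze environment registration.
spec_id = "Maze-Random-10x10-v0"                     # hypothetical example id
size = [int(x) for x in spec_id.split("-")[2].split("x")]
desired_pos = [s - 1 for s in size]                  # bottom-right cell as the target
print(size, desired_pos)                             # [10, 10] [9, 9]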
def __init__(self, env, model, nsteps, total_steps, save_interval, her):
    super().__init__(env=env, model=model, nsteps=nsteps)
    assert isinstance(env.action_space, spaces.Discrete), \
        'This ACER implementation works only with discrete action spaces!'
    self.nact = env.action_space.n
    nenv = self.nenv
    self.nbatch = nenv * nsteps
    self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape
    # self.obs = env.reset()
    self.obs_dtype = env.observation_space.dtype
    self.obs_shape = env.observation_space.shape
    self.ac_dtype = env.action_space.dtype
    self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
    self.save_interval = save_interval
    self.total_steps = total_steps
    self.maze_shape = [int(x) for x in self.env.spec.id.split("-")[2].split("x")]
    self.desired_pos = np.asarray(self.maze_shape) - 1
    logger.info("-" * 50)
    logger.info("-" * 15, "desired_pos:", self.desired_pos, "-" * 15)
    logger.info("-" * 50)
    self.her = her
    assert self.nenv == 1
    self.controller = MetaController(self.maze_shape, env.observation_space.shape,
                                     env.observation_space.dtype)
    self.goal_infos = [{} for _ in range(self.nenv)]
    self.goals = [self.controller.sample_goal() for _ in range(self.nenv)]
    self.mem = ""
    self.episode_step = np.zeros(self.nenv, dtype=np.int32)
    self.episode = np.zeros(self.nenv, dtype=np.int32)
    self.max_episode_length = 1000
def __init__(self, env, model, nsteps, total_steps, save_interval, her):
    super().__init__(env=env, model=model, nsteps=nsteps)
    assert isinstance(env.action_space, spaces.Discrete), \
        'This ACER implementation works only with discrete action spaces!'
    self.nact = env.action_space.n
    nenv = self.nenv
    self.nbatch = nenv * nsteps
    self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape
    # self.obs = env.reset()
    self.obs_dtype = env.observation_space.dtype
    self.obs_shape = env.observation_space.shape
    self.ac_dtype = env.action_space.dtype
    self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
    self.save_interval = save_interval
    self.total_steps = total_steps
    self.maze_shape = [int(x) for x in self.env.spec.id.split("-")[2].split("x")]
    self.desired_pos = arr_to_one_hot(np.asarray(self.maze_shape) - 1, ncat=self.maze_shape[0])
    logger.info("-" * 50)
    logger.info("-" * 15, "desired_pos:", self.desired_pos, "-" * 15)
    logger.info("-" * 50)
    self.her = her
    assert self.nenv == 1
    self.controller = MetaController(self.maze_shape, env.observation_space.shape,
                                     env.observation_space.dtype)
    self.allowed_step = [np.prod(self.maze_shape) * 10]
    self.allowed_step = [np.inf]
    self.goal_infos = [{}]
    self.goals = np.array([self.controller.sample_goal()])
    self.aux_goal = np.copy(self.goals[0])
    self.mem = ""
    self.episode_step = np.zeros(self.nenv, dtype=np.int32)
    self.episode = np.zeros(self.nenv, dtype=np.int32)
    self.aux_step = np.zeros(self.nenv, dtype=np.int32)
    self.aux_dones = np.empty(self.nenv, dtype=bool)
    self.max_episode_length = 3000
    self.aux_dones.fill(False)
    self.aux_entropy = 0.
    self.tar_entropy = 0.
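
# arr_to_one_hot is not defined in this excerpt; a plausible minimal version, assuming
# it one-hot encodes each coordinate of an integer position with `ncat` categories.
import numpy as np

def arr_to_one_hot(arr, ncat):
    arr = np.asarray(arr, dtype=np.int64)
    one_hot = np.zeros((arr.shape[0], ncat), dtype=np.float32)
    one_hot[np.arange(arr.shape[0]), arr] = 1.0
    return one_hot

# e.g. the bottom-right cell of a 5x5 maze:
print(arr_to_one_hot(np.array([4, 4]), ncat=5))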
class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences.

    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam, save_path, store_data):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (General Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma
        self.recorder = DataRecorder(save_path)
        self.episode = np.zeros(self.nenv)
        self.timestamp = np.zeros(self.nenv)
        self.store_data = store_data

    def run(self):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        # For n in range number of steps
        for _ in range(self.nsteps):
            # Given observations, get action, value and neglogpacs
            # We already have self.obs because the Runner superclass ran self.obs[:] = env.reset() on init
            obs_tmp = self.obs.copy()
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            # Take actions in env and look at the results
            # Infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            self.timestamp += 1
            # todo: add x, y, obs to pkl file.
            for env_idx, info in enumerate(infos):
                maybeepinfo = info.get('episode')
                if self.store_data:
                    data = dict(episode=self.episode[env_idx], timestamp=self.timestamp[env_idx],
                                x_pos=info["x_pos"], y_pos=info["y_pos"], obs=obs_tmp[env_idx],
                                next_obs=self.obs[env_idx], act=actions[env_idx], value=values[env_idx])
                    self.recorder.store(data)
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
                    if self.store_data:
                        self.episode[env_idx] += 1
                        self.timestamp[env_idx] = 0
                        self.recorder.dump()
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)
        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
                mb_states, epinfos)
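
# The return/advantage computation in run() above is standard GAE(lambda). For
# reference, a standalone NumPy version of the same backward recursion, run here on
# synthetic inputs shaped (nsteps, nenv):
import numpy as np

def compute_gae(rewards, values, dones, last_values, last_dones, gamma=0.99, lam=0.95):
    nsteps = rewards.shape[0]
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal, nextvalues = 1.0 - last_dones, last_values
        else:
            nextnonterminal, nextvalues = 1.0 - dones[t + 1], values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    return advs + values, advs   # (returns, advantages)

returns, advs = compute_gae(np.ones((5, 2), dtype=np.float32),
                            np.zeros((5, 2), dtype=np.float32),
                            np.zeros((5, 2), dtype=bool),
                            np.zeros(2, dtype=np.float32),
                            np.zeros(2, dtype=bool))
print(returns)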
class Acer:
    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.keys = ["episode_return", "episode_length", "rewards", "her_gain"]
        self.episode_stats = EpisodeStats(maxlen=10, keys=self.keys)
        self.steps = 0
        self.save_interval = self.runner.save_interval
        self.recoder = DataRecorder(os.path.join(logger.get_dir(), "samples"))
        sess = self.model.sess
        self.save = functools.partial(save_variables, sess=sess, variables=self.model.params)

    def call(self, replay_start, nb_train_epoch):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        results = runner.run()
        buffer.put(results)
        self.record_episode_info(results["episode_info"])
        obs, next_obs, actions, rewards, mus, dones, masks, goal_obs = self.adjust_shape(results)
        names_ops, values_ops = model.train_policy(obs, next_obs, actions, rewards, dones, mus,
                                                   model.initial_state, masks, steps, goal_obs)
        if buffer.has_atleast(replay_start):
            for i in range(nb_train_epoch):
                if i == 0:
                    results = buffer.get(use_cache=False)
                else:
                    results = buffer.get(use_cache=True)
                obs, next_obs, actions, rewards, mus, dones, masks, goal_obs = self.adjust_shape(results)
                names_ops, values_ops = model.train_policy(obs, next_obs, actions, rewards, dones, mus,
                                                           model.initial_state, masks, steps, goal_obs)
                self.episode_stats.feed(np.mean(rewards), "rewards")
                self.episode_stats.feed(results["her_gain"], "her_gain")
        if int(steps / runner.nbatch) % self.log_interval == 0:
            names_ops, values_ops = names_ops + ["memory_usage(GB)"], values_ops + [self.buffer.memory_usage]
            self.log(names_ops, values_ops)
        if int(steps / runner.nbatch) % (self.log_interval * 200) == 0:
            self.save(os.path.join(logger.get_dir(), "{}.pkl".format(self.steps)))
        if self.save_interval > 0 and int(steps / runner.nbatch) % self.save_interval == 0:
            results["acer_steps"] = self.steps
            self.recoder.store(results)
            self.recoder.dump()

    def adjust_shape(self, results):
        runner = self.runner
        obs = results["obs"][:, :-1].copy()
        # next_obs = results["obs"][:, 1:].copy()
        next_obs = results["next_obs"].copy()
        obs = obs.reshape((runner.nbatch,) + runner.obs_shape)
        next_obs = next_obs.reshape((runner.nbatch,) + runner.obs_shape)
        actions = results["actions"].reshape(runner.nbatch)
        rewards = results["rewards"].reshape(runner.nbatch)
        mus = results["mus"].reshape([runner.nbatch, runner.nact])
        dones = results["dones"].reshape([runner.nbatch])
        masks = results["masks"].reshape([runner.batch_ob_shape[0]])
        goal_obs = results["goal_obs"].reshape((runner.nbatch,) + runner.obs_shape)
        return obs, next_obs, actions, rewards, mus, dones, masks, goal_obs

    def record_episode_info(self, episode_info):
        returns = episode_info.get("episode", None)
        if returns:
            self.episode_stats.feed(returns["r"], "episode_return")
            self.episode_stats.feed(returns["l"], "episode_length")

    def log(self, names_ops, values_ops):
        logger.record_tabular("total_timesteps", self.steps)
        logger.record_tabular("fps", int(self.steps / (time.time() - self.tstart)))
        for name, val in zip(names_ops, values_ops):
            logger.record_tabular(name, float(val))
        for key in self.keys:
            logger.record_tabular(key, self.episode_stats.get_mean(key))
        logger.dump_tabular()
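
# A hedged sketch of the outer loop that could drive Acer.call. The construction of
# `runner`, `model`, and `buffer` is repo-specific and elided; `total_timesteps`,
# `replay_start`, and `nb_train_epoch` are hypothetical arguments, and the way `steps`
# is advanced here (one on-policy batch per call) is an assumption, not repo code.
import time

def learn_loop(acer, total_timesteps, replay_start=10000, nb_train_epoch=4):
    acer.tstart = time.time()            # Acer.log divides by (time.time() - tstart)
    nbatch = acer.runner.nbatch
    while acer.steps < total_timesteps:
        acer.call(replay_start=replay_start, nb_train_epoch=nb_train_epoch)
        acer.steps += nbatch             # assumed: one call consumes one on-policy batch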
class Runner:
    TEMPLATE = 'env_{} {}!|goal:{}|final_pos:{}|length:{}'

    def __init__(self, env, model, curriculum, nsteps, reward_fn, threshold):
        assert isinstance(env.action_space, spaces.Discrete), \
            'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)
        self.env = env
        self.model = model
        self.policy_inputs = self.model.policy_inputs
        self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        self.nact = env.action_space.n
        self.nbatch = nenv * nsteps
        self.obs_shape = self.model.obs_shape
        self.obs_dtype = self.model.obs_dtype
        self.ac_dtype = env.action_space.dtype
        self.achieved_goal_shape = self.model.achieved_goal_sh
        self.desired_goal_shape = self.model.desired_goal_sh
        self.desired_goal_state_shape = self.model.desired_goal_state_sh
        self.dict_obs = isinstance(self.env.observation_space, spaces.Dict)
        self.obs = np.zeros((nenv,) + self.obs_shape, dtype=self.obs_dtype)
        self.achieved_goal = np.zeros((nenv,) + self.achieved_goal_shape, dtype=np.float32)
        self.desired_goal = np.zeros((nenv,) + self.desired_goal_shape, dtype=np.float32)
        self.desired_goal_state = np.zeros((nenv,) + self.desired_goal_state_shape, dtype=self.obs_dtype)
        self.desired_goal_info = np.zeros((nenv,), dtype=object)
        self.nb_tile = self.achieved_goal.shape[-1] // 2
        if self.dict_obs:
            dict_obs = self.env.reset()
            self.obs[:] = dict_obs['observation']
            achieved_goal = dict_obs["achieved_goal"]
            self.achieved_goal[:] = np.tile(achieved_goal, [1, self.nb_tile])
        else:
            self.obs[:] = self.env.reset()
        self.nsteps = nsteps
        self.curriculum = curriculum
        self.desired_goal[:], self.desired_goal_state[:], self.desired_goal_info[:] = \
            self.curriculum.get_current_target(nb_goal=self.nenv)
        self.recoder = None
        self.wrong_recorder = None
        self.episode_step = np.zeros(self.nenv, dtype=np.int32)
        self.reward_fn = reward_fn
        self.threshold = threshold
        self.include_death = False
        self.log_episode_step = deque(maxlen=10)
        self.log_episode_success = deque(maxlen=10)
        self.log_episode_x_pos = deque(maxlen=10)
        self.log_episode_y_pos = deque(maxlen=10)

    def run(self, acer_steps):
        if self.recoder is None:
            self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
        if self.wrong_recorder is None:
            self.wrong_recorder = DataRecorder(os.path.join(logger.get_dir(), "wrong_data"))
        mb_obs, mb_next_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_death = [], [], [], [], [], [], []
        mb_next_obs_infos, mb_desired_goal_infos = [], []
        mb_achieved_goal, mb_next_achieved_goal, mb_desired_goal, mb_desired_goal_state = [], [], [], []
        for step in range(self.nsteps):
            actions, mus = self.model.step({
                'obs': self.obs.copy(),
                'achieved_goal': self.achieved_goal.copy(),
                'desired_goal': self.desired_goal.copy(),
                'desired_goal_state': self.desired_goal_state.copy()
            })
            mb_obs.append(np.copy(self.obs))
            mb_achieved_goal.append(np.copy(self.achieved_goal))
            mb_desired_goal.append(np.copy(self.desired_goal))
            mb_desired_goal_state.append(np.copy(self.desired_goal_state))
            mb_actions.append(actions)
            mb_mus.append(mus)
            mb_desired_goal_infos.append(np.copy(self.desired_goal_info))
            # step
            if self.dict_obs:
                dict_obs, _, dones, infos = self.env.step(actions)
                obs, achieved_goal = dict_obs['observation'], dict_obs['achieved_goal']
                achieved_goal = np.tile(achieved_goal, [1, self.nb_tile])  # expand from 2-d to 256-d
            else:
                obs, _, dones, infos = self.env.step(actions)
            rewards = np.zeros(self.nenv, np.float32)
            death = np.array([False for _ in range(self.nenv)], dtype=np.bool)
            self.episode_step += 1
            for e in range(self.nenv):
                if infos[e]['x_pos'] == 65535:
                    infos[e]['x_pos'] = 0
            # get real next obs and achieved goal
            next_obs = obs.copy()
            next_achieved_goal = goal_info_to_embedding(infos, goal_dim=self.achieved_goal.shape[-1])
            for e in range(self.nenv):
                if dones[e]:
                    if self.dict_obs:
                        _dict_obs = infos[e]['next_obs']
                        _obs = _dict_obs['observation']
                    else:
                        _obs = infos[e].get('next_obs')
                    assert _obs is not None
                    next_obs[e] = _obs
            mb_next_obs.append(next_obs)
            mb_next_obs_infos.append(infos)
            mb_next_achieved_goal.append(next_achieved_goal)
            # detect wrong x_pos
            for e in range(self.nenv):
                x_pos = infos[e].get('x_pos')
                if x_pos > 3000:
                    logger.info('detected a wrong x_pos:{}'.format(x_pos))
                    data = {'obs': self.obs[e], 'next_obs': obs[e], 'action': actions[e],
                            'info': infos[e], 'episode_step': self.episode_step[e],
                            'true_next_obs': next_obs[e], 'acer_step': acer_steps}
                    self.wrong_recorder.store(data)
                    self.wrong_recorder.dump()
            # goal achieved & episode done
            for e in range(self.nenv):
                reached = self.check_goal_reached_v2(infos[e], self.desired_goal_info[e])
                if reached or self.episode_step[e] > self.curriculum.allow_step \
                        or infos[e]["x_pos"] > self.desired_goal_info[e]["x_pos"] + 100:
                    # log info
                    final_pos = {"x_pos": infos[e]["x_pos"], "y_pos": infos[e]["y_pos"]}
                    succ = True if reached else False
                    self.recoder.store(dict(env=e, succ=succ, length=self.episode_step[e], final_pos=final_pos))
                    logger.info(self.TEMPLATE.format(e, succ, self.desired_goal_info[e],
                                                     final_pos, self.episode_step[e]))
                    # episode info
                    self.log_episode_step.append(self.episode_step[e])
                    self.log_episode_success.append(1.0 if succ else 0.0)
                    self.log_episode_x_pos.append(infos[e]['x_pos'])
                    self.log_episode_y_pos.append(infos[e]['y_pos'])
                    self.episode_step[e] = 0
                    # reward and dones
                    if reached:
                        rewards[e] = 1.0
                    dones[e] = True
                    # reset
                    if self.dict_obs:
                        _dict_obs = self.env.reset_v2(e)
                        obs[e], achieved_goal[e] = _dict_obs['observation'][0], \
                            np.tile(_dict_obs['achieved_goal'][0], self.nb_tile)
                        assert np.array_equal(achieved_goal[e], np.tile(np.array([40., 176.]), self.nb_tile))
                    else:
                        _obs = self.env.reset_v2(e)[0]
                        obs[e] = _obs
                    # curriculum
                    self.curriculum.update(succ=succ, acer_steps=acer_steps)
                    self.desired_goal[e], self.desired_goal_state[e], self.desired_goal_info[e] = \
                        self.curriculum.get_current_target(nb_goal=1)
                elif dones[e]:
                    # log info
                    final_pos = {"x_pos": infos[e]["x_pos"], "y_pos": infos[e]["y_pos"]}
                    self.recoder.store(dict(env=e, succ=False, length=self.episode_step[e], final_pos=final_pos))
                    logger.info(self.TEMPLATE.format(e, False, self.desired_goal_info[e],
                                                     final_pos, self.episode_step[e]))
                    # episode info
                    self.log_episode_step.append(self.episode_step[e])
                    self.log_episode_success.append(0.0)
                    self.log_episode_x_pos.append(infos[e]['x_pos'])
                    self.log_episode_y_pos.append(infos[e]['y_pos'])
                    self.episode_step[e] = 0
                    # reward and death info
                    if infos[e]['is_dying'] or infos[e]['is_dead']:
                        death[e] = True
                        if self.include_death:
                            rewards[e] = -1
                    # curriculum
                    self.curriculum.update(succ=False, acer_steps=acer_steps)
                    self.desired_goal[e], self.desired_goal_state[e], self.desired_goal_info[e] = \
                        self.curriculum.get_current_target(nb_goal=1)
            # state information for stateful models like LSTM
            self.obs = obs
            if self.dict_obs:
                self.achieved_goal = achieved_goal
            mb_rewards.append(rewards)
            mb_death.append(death)
            mb_dones.append(dones)
        mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_next_obs = np.asarray(mb_next_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_achieved_goal = np.asarray(mb_achieved_goal, dtype=np.float32).swapaxes(1, 0)
        mb_next_achieved_goal = np.asarray(mb_next_achieved_goal, dtype=np.float32).swapaxes(1, 0)
        mb_desired_goal = np.asarray(mb_desired_goal, dtype=np.float32).swapaxes(1, 0)
        mb_desired_goal_state = np.asarray(mb_desired_goal_state, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_death = np.asarray(mb_death, dtype=np.bool).swapaxes(1, 0)
        mb_next_obs_infos = np.asarray(mb_next_obs_infos, dtype=object).swapaxes(1, 0)
        mb_desired_goal_infos = np.asarray(mb_desired_goal_infos, dtype=object).swapaxes(1, 0)
        if not np.array_equal(mb_rewards, self.reward_fn(mb_next_obs_infos, mb_desired_goal_infos)):
            import ipdb
            ipdb.set_trace()
        results = dict(
            obs=mb_obs,
            next_obs=mb_next_obs,
            achieved_goal=mb_achieved_goal,
            next_achieved_goal=mb_next_achieved_goal,
            desired_goal=mb_desired_goal,
            desired_goal_state=mb_desired_goal_state,
            actions=mb_actions,
            rewards=mb_rewards,
            mus=mb_mus,
            dones=mb_dones,
            deaths=mb_death,
            next_obs_infos=mb_next_obs_infos,
            desired_goal_infos=mb_desired_goal_infos,
        )
        return results

    def check_goal_reached_v2(self, obs_info, goal_info):
        obs_x, obs_y = float(obs_info["x_pos"]), float(obs_info["y_pos"])
        goal_x, goal_y = float(goal_info["x_pos"]), float(goal_info["y_pos"])
        diff_x = abs(obs_x - goal_x)
        diff_y = abs(obs_y - goal_y)
        if diff_x <= self.threshold[0] and diff_y <= self.threshold[1]:
            status = True
        else:
            status = False
        return status

    def evaluate(self):
        tstart = time.time()
        n_episode = 0
        self.log_episode_step = deque(maxlen=10)
        self.log_episode_success = deque(maxlen=10)
        self.log_episode_x_pos = deque(maxlen=10)
        self.log_episode_y_pos = deque(maxlen=10)
        self.episode_step[:] = 0
        if self.dict_obs:
            dict_obs = self.env.reset()
            self.obs[:] = dict_obs['observation']
            achieved_goal = dict_obs["achieved_goal"]
            self.achieved_goal[:] = np.tile(achieved_goal, [1, self.nb_tile])
        else:
            self.obs[:] = self.env.reset()
        while n_episode < 10:
            while True:
                actions, mus = self.model.step({
                    'obs': self.obs.copy(),
                    'achieved_goal': self.achieved_goal.copy(),
                    'desired_goal': self.desired_goal.copy(),
                    'desired_goal_state': self.desired_goal_state.copy()
                })
                # step
                if self.dict_obs:
                    dict_obs, _, dones, infos = self.env.step(actions)
                    obs, achieved_goal = dict_obs['observation'], dict_obs['achieved_goal']
                    achieved_goal = np.tile(achieved_goal, [1, self.nb_tile])  # expand from 2-d to 256-d
                else:
                    obs, _, dones, infos = self.env.step(actions)
                self.episode_step += 1
                for e in range(self.nenv):
                    if infos[e]['x_pos'] == 65535:
                        infos[e]['x_pos'] = 0
                # goal achieved & episode done
                for e in range(self.nenv):
                    reached = self.check_goal_reached_v2(infos[e], self.desired_goal_info[e])
                    if reached or self.episode_step[e] > self.curriculum.allow_step \
                            or infos[e]["x_pos"] > self.desired_goal_info[e]["x_pos"] + 100:
                        self.log_episode_step.append(self.episode_step[e])
                        self.log_episode_success.append(1.0 if reached else 0.0)
                        self.log_episode_x_pos.append(infos[e]['x_pos'])
                        self.log_episode_y_pos.append(infos[e]['y_pos'])
                        self.episode_step[e] = 0
                        dones[e] = True
                        n_episode += 1
                        # reset
                        if self.dict_obs:
                            _dict_obs = self.env.reset_v2(e)
                            obs[e], achieved_goal[e] = _dict_obs['observation'][0], \
                                np.tile(_dict_obs['achieved_goal'][0], self.nb_tile)
                            assert np.array_equal(achieved_goal[e], np.tile(np.array([40., 176.]), self.nb_tile))
                        else:
                            _obs = self.env.reset_v2(e)[0]
                            obs[e] = _obs
                    elif dones[e]:
                        # episode info
                        self.log_episode_step.append(self.episode_step[e])
                        self.log_episode_success.append(0.0)
                        self.log_episode_x_pos.append(infos[e]['x_pos'])
                        self.log_episode_y_pos.append(infos[e]['y_pos'])
                        self.episode_step[e] = 0
                        n_episode += 1
                # state information for stateful models like LSTM
                self.obs = obs
                if self.dict_obs:
                    self.achieved_goal = achieved_goal
                if n_episode >= 10:
                    break
        logs = list()
        logs.append(('test/final_x_pos', np.mean(self.log_episode_x_pos)))
        logs.append(('test/final_y_pos', np.mean(self.log_episode_y_pos)))
        logs.append(('test/success', np.mean(self.log_episode_success)))
        logs.append(('test/episode_length', np.mean(self.log_episode_step)))
        logs.append(('time/evaluate', time.time() - tstart))
        return logs

    def logs(self):
        logs = list()
        logs.append(('train/final_x_pos', self._safe_mean(self.log_episode_x_pos)))
        logs.append(('train/final_y_pos', self._safe_mean(self.log_episode_y_pos)))
        logs.append(('train/success', self._safe_mean(self.log_episode_success)))
        logs.append(('train/episode_length', self._safe_mean(self.log_episode_step)))
        return logs

    @staticmethod
    def _safe_mean(x):
        if len(x) == 0:
            return 0.
        else:
            return np.mean(x)
class Runner(AbstractEnvRunner):
    def __init__(self, env, model, nsteps, store_data, reward_fn, sample_goal,
                 threshold=None, alt_model=None, use_random_policy_expl=None):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(env.action_space, spaces.Discrete), \
            'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)
        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape
        # self.obs = env.reset()  # the super().__init__ already does this
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype
        self.ac_shape = env.action_space.shape
        self.nstack = self.env.nstack
        self.nc = self.batch_ob_shape[-1] // self.nstack
        self.goal_shape = self.model.goal_shape
        self.goal_as_image = self.model.goal_as_image
        self.save_path = os.path.join(logger.get_dir(), "runner_data")
        self.store_data = store_data
        self.recorder = DataRecorder(self.save_path)
        self.dynamics = self.model.dynamics
        self.sample_goal = sample_goal
        self.threshold = threshold
        # self.batch_goal_feat_shape = (nenv*(nsteps+1),) + env.observation_space.shape + self.dynamics.feat_shape
        self.reached_status = np.array([False for _ in range(self.nenv)], dtype=bool)
        self.goals, self.goal_info = None, None
        self.reward_fn = reward_fn
        # self.results_writer = ResultsWriter(os.path.join(save_path, "evaluation.csv"))
        self.episode = np.ones(self.nenv)
        self.episode_step = np.zeros(self.nenv)
        self.episode_reached_step = np.zeros(self.nenv)
        self.episode_reward_to_go = np.zeros(self.nenv)
        self.name = self.model.scope.split("acer_")[1]
        self.alt_model = alt_model
        self.use_random_policy_expl = use_random_policy_expl
        if self.use_random_policy_expl:
            assert alt_model is not None

    def run(self, acer_step=None):
        if self.goals is None:
            self.goals, self.goal_info = self.dynamics.get_goal(nb_goal=self.nenv)
            if not self.goal_as_image:
                self.goals = goal_to_embedding(self.goal_info)
        mb_obs = np.empty((self.nenv, self.nsteps) + self.obs_shape, dtype=self.obs_dtype)
        mb_next_obs = np.empty((self.nenv, self.nsteps) + self.obs_shape, dtype=self.obs_dtype)
        mb_act = np.empty((self.nenv, self.nsteps) + self.ac_shape, dtype=self.ac_dtype)
        mb_mus = np.empty((self.nenv, self.nsteps, self.nact), dtype=np.float32)
        mb_ext_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
        mb_int_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
        mb_masks = np.empty((self.nenv, self.nsteps + 1), dtype=bool)
        mb_ext_rew = np.empty((self.nenv, self.nsteps), dtype=np.float32)
        mb_next_obs_infos = np.empty((self.nenv, self.nsteps), dtype=object)
        mb_goals = np.empty((self.nenv, self.nsteps) + self.goal_shape, dtype=self.obs_dtype)
        mb_goal_infos = np.empty((self.nenv, self.nsteps), dtype=object)
        reached_step = np.array([None for _ in range(self.nenv)])
        done_step = np.array([None for _ in range(self.nenv)])
        episode_infos = np.asarray([{} for _ in range(self.nenv)], dtype=object)
        for step in range(self.nsteps):
            check_obs(self.obs)
            actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=self.goals)
            if self.sample_goal:
                if self.use_random_policy_expl:
                    actions[self.reached_status] = self.simple_random_action(np.sum(self.reached_status))
                    mus[self.reached_status] = self.get_mu_of_random_action()
                else:
                    if np.sum(self.reached_status) > 0:
                        alt_action, alt_mu, alt_states = self.alt_model.step(
                            self.obs, S=self.states, M=self.dones, goals=self.goals)
                        actions[self.reached_status] = alt_action[self.reached_status]
                        mus[self.reached_status] = alt_mu[self.reached_status]
            mb_obs[:, step] = deepcopy(self.obs)
            mb_act[:, step] = actions
            mb_mus[:, step, :] = mus
            mb_masks[:, step] = deepcopy(self.dones)
            obs, rewards, dones, infos = self.env.step(actions)
            check_infos(infos, self.recorder, dones, acer_step)
            for info in infos:
                info.update({"source": self.name})
            mb_ext_dones[:, step] = dones
            mb_ext_rew[:, step] = rewards
            self.episode_reward_to_go[self.reached_status] += rewards[self.reached_status]
            mb_next_obs[:, step] = self.get_real_next_obs(obs, dones, infos)
            mb_next_obs_infos[:, step] = np.asarray(infos, dtype=object)
            mb_goals[:, step] = deepcopy(self.goals)
            mb_goal_infos[:, step] = deepcopy(self.goal_info)
            self.episode_step += 1
            # state information for stateful models like LSTM
            self.states = states
            self.dones = dones
            self.obs = obs
            # check reached
            reached_step = self.update_reach(reached_step, infos, step)
            # check done
            done_step[self.dones] = step
            # revise goal
            if not self.sample_goal:
                mb_goals, mb_goal_infos = self.update_goal_v1(mb_next_obs, mb_goals, infos,
                                                              mb_goal_infos, done_step, step)
            else:
                mb_goals, mb_goal_infos = self.update_goal_v2(mb_next_obs, mb_goals, infos,
                                                              mb_goal_infos, reached_step, step)
            # summary
            episode_infos = self.summary(episode_infos, infos, acer_step)
        mb_int_rewards = self.reward_fn(mb_next_obs_infos, mb_goal_infos)
        mb_int_dones.fill(False)
        int_dones_index = np.where(mb_int_rewards)
        mb_int_dones[int_dones_index] = True
        # shapes are adjusted to [nenv, nsteps, ...]
        self.recorder.dump()
        results = dict(
            obs=mb_obs,
            next_obs=mb_next_obs,
            actions=mb_act,
            ext_rewards=mb_ext_rew,
            mus=mb_mus,
            ext_dones=mb_ext_dones,
            int_dones=mb_int_dones,
            masks=mb_masks,
            next_obs_infos=mb_next_obs_infos,  # (nenv, nsteps); used by the dynamics module and by the buffer
            episode_infos=episode_infos,
            goal_obs=mb_goals,  # (nenv, nsteps+1)
            goal_infos=mb_goal_infos,
            int_rewards=mb_int_rewards)
        return results

    def simple_random_action(self, nb_action):
        return np.random.randint(0, self.env.action_space.n, nb_action)

    def get_mu_of_random_action(self):
        assert isinstance(self.env.action_space, spaces.Discrete)
        return np.array([1 / self.env.action_space.n for _ in range(self.env.action_space.n)])

    def initialize(self, init_steps):
        mb_obs, mb_actions, mb_next_obs, mb_goal_infos = [], [], [], []
        for _ in range(init_steps):
            mb_obs.append(deepcopy(self.obs))
            actions = np.asarray([self.env.action_space.sample() for _ in range(self.nenv)])
            self.obs, rewards, dones, infos = self.env.step(actions)
            goal_infos = np.array([{"x_pos": info.get("x_pos", None),
                                    "y_pos": info.get("y_pos", None),
                                    "source": self.name} for info in infos], dtype=object)
            next_obs = self.get_real_next_obs(np.copy(self.obs), dones, infos)
            mb_next_obs.append(next_obs)
            mb_goal_infos.append(goal_infos)
            mb_actions.append(actions)
        mb_obs = np.asarray(mb_obs).swapaxes(1, 0)  # (nenv, nstep, obs_shape)
        mb_goal_infos = np.asarray(mb_goal_infos, dtype=object).swapaxes(1, 0)  # (nenv, nstep, dict)
        mb_actions = np.asarray(mb_actions).swapaxes(1, 0)
        mb_next_obs = np.asarray(mb_next_obs).swapaxes(1, 0)
        mb_obs = mb_obs.reshape((-1,) + mb_obs.shape[2:])
        mb_goal_infos = mb_goal_infos.reshape(-1, )
        mb_actions = mb_actions.reshape((-1,) + mb_actions.shape[2:])
        mb_next_obs = mb_next_obs.reshape((-1,) + mb_next_obs.shape[2:])
        for i in range(10):
            batch_size = min(64, init_steps)
            ind = np.random.randint(0, init_steps, batch_size)
            obs, actions, next_obs = mb_obs[ind], mb_actions[ind], mb_next_obs[ind]
            nb_train_epoch = 1
            self.model.train_dynamics(obs, actions, next_obs, nb_train_epoch)
        self.dynamics.put_goal(mb_obs, mb_actions, mb_next_obs, mb_goal_infos)
        self.obs = self.env.reset()

    def evaluate(self, nb_eval):
        assert self.dynamics.dummy
        goal_obs, goal_info = self.dynamics.get_goal(nb_goal=self.nenv)  # (nenv, goal_dim)
        eval_info = {"l": 0, "r": 0, "x_pos": 0, "y_pos": 0}
        for i in range(nb_eval):
            terminal = False
            while True:
                actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=goal_obs)
                obs, rewards, dones, infos = self.env.step(actions)
                info = infos[0]
                if info.get("episode"):
                    assert dones[0]
                    eval_info["l"] += info.get("episode")["l"]
                    eval_info["r"] += info.get("episode")["r"]
                    eval_info["x_pos"] += info.get("x_pos")
                    eval_info["y_pos"] += info.get("y_pos")
                    terminal = True
                if terminal:
                    break
                self.states = states
                self.dones = dones
                self.obs = obs
        self.obs = self.env.reset()
        for key in eval_info.keys():
            eval_info[key] /= nb_eval
        return eval_info

    def log(self, mem):
        succ = "succ" if mem["is_succ"] else "fail"
        template = "env_{} {}|goal:{}|final_pos:{}|size:{}".format(
            mem["env"], succ,
            {"x_pos": mem["goal"]["x_pos"], "y_pos": mem["goal"]["y_pos"]},
            mem["final_pos"], self.dynamics.queue.qsize())
        logger.info(template)

    def summary(self, episode_infos, infos, acer_step):
        for env_idx in range(self.nenv):
            info = infos[env_idx]
            if self.dones[env_idx]:
                assert info.get("episode")
            if info.get("episode"):
                episode_infos[env_idx]["episode"] = info.get("episode")
                if not self.sample_goal:
                    episode_infos[env_idx]["reached_info"] = dict(source=self.name,
                                                                  x_pos=infos[env_idx]["x_pos"],
                                                                  y_pos=infos[env_idx]["y_pos"])
                else:
                    if self.reached_status[env_idx]:
                        reached = 1.0
                        time_ratio = self.episode_reached_step[env_idx] / self.episode_step[env_idx]
                        achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                        mem = dict(env=env_idx, is_succ=True, goal=self.goal_info[env_idx],
                                   final_pos=achieved_pos, timestep=acer_step,
                                   episode=self.episode[env_idx], step=self.episode_step[env_idx])
                        self.recorder.store(mem)
                        self.log(mem)
                        abs_dist = 10
                    else:
                        reached = 0.0
                        time_ratio = 1.0
                        achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                        mem = dict(env=env_idx, is_succ=False, goal=self.goal_info[env_idx],
                                   final_pos=achieved_pos, timestep=acer_step,
                                   episode=self.episode[env_idx], step=self.episode_step[env_idx])
                        self.recorder.store(mem)
                        self.log(mem)
                        abs_dist = abs(float(infos[env_idx]["x_pos"]) - float(self.goal_info[env_idx]["x_pos"])) + \
                                   abs(float(infos[env_idx]["y_pos"]) - float(self.goal_info[env_idx]["y_pos"]))
                    episode_infos[env_idx]["reached_info"] = dict(reached=reached, time_ratio=time_ratio,
                                                                  abs_dist=abs_dist, source=self.name,
                                                                  x_pos=infos[env_idx]["x_pos"],
                                                                  y_pos=infos[env_idx]["y_pos"])
                    episode_infos[env_idx]["goal_info"] = dict(x_pos=self.goal_info[env_idx]["x_pos"],
                                                               y_pos=self.goal_info[env_idx]["y_pos"],
                                                               source=self.goal_info[env_idx]["source"],
                                                               reward_to_go=self.episode_reward_to_go[env_idx])
                    # re-plan goal
                    goal_obs, goal_info = self.dynamics.get_goal(nb_goal=1)
                    if self.goal_as_image:
                        self.goals[env_idx] = goal_obs[0]
                    else:
                        self.goals[env_idx] = goal_to_embedding(goal_info[0])
                    self.goal_info[env_idx] = goal_info[0]
                self.episode[env_idx] += 1
                self.episode_step[env_idx] = 0
                self.episode_reached_step[env_idx] = 0
                self.reached_status[env_idx] = False
                self.episode_reward_to_go[env_idx] = 0
        return episode_infos

    def get_real_next_obs(self, next_obs, dones, infos):
        _next_obs = next_obs.copy()
        for env_idx in range(self.nenv):
            if dones[env_idx]:
                o = infos[env_idx].get("next_obs", None)
                assert o is not None
                _next_obs[env_idx] = o
        return _next_obs

    def update_reach(self, reached_step, infos, step):
        if self.sample_goal:
            for env_idx in range(self.nenv):
                if not self.reached_status[env_idx]:
                    self.reached_status[env_idx] = check_goal_reached(infos[env_idx],
                                                                      self.goal_info[env_idx],
                                                                      self.threshold)
                    if self.reached_status[env_idx]:
                        reached_step[env_idx] = step
                        self.episode_reached_step[env_idx] = deepcopy(self.episode_step[env_idx])
        return reached_step

    def update_goal_v1(self, mb_next_obs, mb_goals, infos, mb_goal_infos, done_step, step):
        assert not self.sample_goal
        for env_idx in range(self.nenv):
            if self.dones[env_idx]:
                # (- - done(t)) -> (done, done, done(t))
                start, end = 0, step + 1
                if self.goal_as_image:
                    mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                else:
                    mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                mb_goal_infos[env_idx, start:end] = infos[env_idx]
            elif step == self.nsteps - 1:
                if done_step[env_idx] is None:
                    # (- - t) -> (t, t, t)
                    start = 0
                else:
                    # (- - done - - t) -> (- - - t, t, t)
                    start = done_step[env_idx] + 1
                end = step + 1
                if end == start:
                    continue
                if self.goal_as_image:
                    mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                else:
                    mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                mb_goal_infos[env_idx, start:end] = infos[env_idx]
        return mb_goals, mb_goal_infos

    def update_goal_v2(self, mb_next_obs, mb_goals, infos, mb_goal_infos, reached_step, step):
        assert self.sample_goal
        for env_idx in range(self.nenv):
            if step != self.nsteps - 1:
                # dones is an instant variable but reached_status is a transitive variable
                if self.dones[env_idx] and self.reached_status[env_idx]:
                    if reached_step[env_idx] is None:
                        # reach|[- - done] -> [done, done, done]
                        start, end = 0, step + 1
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    else:
                        # [- - reach(done)] -> [- - -] if reached_step[env_idx] == step
                        # [- - reach - - done] -> [- - - done done done]
                        start, end = reached_step[env_idx] + 1, step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                elif not self.dones[env_idx] and self.reached_status[env_idx]:
                    # reached|[- - -] if reached_step[env_idx] is None
                    # [- - reached - -] if reached_step[env_idx] is not None
                    pass
                else:
                    # [- - - done] if self.dones[env_idx] and not self.reached_status[env_idx]
                    # [- - - - -] if not self.dones[env_idx] and not self.reached_status[env_idx]
                    pass
            else:
                if self.dones[env_idx] and self.reached_status[env_idx]:
                    if reached_step[env_idx] is None:
                        # reach|[- - done(t)] -> [done, done, done(t)]
                        start, end = 0, step + 1
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    else:
                        # [- - reach(done)(t)] -> [- - -]
                        # [- - reach - - done(t)] -> [- - - done done done(t)]
                        start, end = reached_step[env_idx] + 1, step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                elif not self.dones[env_idx] and self.reached_status[env_idx]:
                    if reached_step[env_idx] is None:
                        # reached|[- - t] -> reached|[t t t]
                        start, end = 0, step + 1
                    else:
                        # reached|[- - r - -] -> reached|[- - - t t]
                        start, end = reached_step[env_idx] + 1, step + 1
                    if end == start:
                        continue
                    if self.goal_as_image:
                        mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                    else:
                        mb_goals[env_idx, start:end] = goal_to_embedding(infos[env_idx])
                else:
                    # [- - - done(t)] if self.dones[env_idx] and not self.reached_status[env_idx]
                    # [- - - - (t)] if not self.dones[env_idx] and not self.reached_status[env_idx]
                    pass
        return mb_goals, mb_goal_infos
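
# A simplified, single-env trace of the hindsight relabeling window used by
# update_goal_v1 (update_goal_v2 applies the same window, but only to the segment
# after the goal was reached). This is a sketch with string placeholders, not repo code:
# "g" is the originally sampled goal and "s{t}" the next_obs seen at step t.
import numpy as np

nsteps = 6
goals = np.array(["g"] * nsteps, dtype=object)
next_obs = np.array(["s{}".format(t) for t in range(nsteps)], dtype=object)
done_step = 2
# window [0 .. done_step] is overwritten with the frame observed at the done step
goals[0:done_step + 1] = next_obs[done_step]
# the tail (done_step + 1 .. nsteps - 1) is overwritten with the last frame of the rollout
goals[done_step + 1:nsteps] = next_obs[nsteps - 1]
print(goals)   # ['s2' 's2' 's2' 's5' 's5' 's5']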
class Dynamics:
    def __init__(self, sess, env, auxiliary_task, queue_size, feat_dim, normalize_novelty):
        self.sess = sess
        self.dummy = False
        self.make_auxiliary_task = {
            "RF": RandomFeature,
            "IDF": InverseDynamics,
            "RND": RandomNetworkDistillation,
        }[auxiliary_task.upper()]
        self.auxiliary_task = self.make_auxiliary_task(env, feat_dim)
        self.obs = self.auxiliary_task.obs
        self.next_obs = self.auxiliary_task.next_obs
        self.ac = self.auxiliary_task.ac
        self.ac_space = self.auxiliary_task.ac_space
        self.feat = tf.stop_gradient(self.auxiliary_task.feature)
        self.feat_shape = tuple(self.feat.get_shape().as_list()[1:])
        self.feat_var = tf.reduce_mean(tf.nn.moments(self.feat, axes=-1)[1])
        self.out_feat = tf.stop_gradient(self.auxiliary_task.next_feature)
        self.nenv = env.num_envs if hasattr(env, 'num_envs') else 1

        with tf.variable_scope("dynamics"):
            self.novelty_tf = tf.placeholder(tf.float32, [None], "novelty_placeholder")
            if isinstance(self.auxiliary_task, RandomNetworkDistillation):
                self.dyna_loss = tf.zeros([])
                self.novelty = self.auxiliary_task.get_novelty()
            elif isinstance(self.auxiliary_task, (InverseDynamics, RandomFeature)):
                with tf.variable_scope("loss"):
                    self.novelty = self._get_novelty()
                    self.dyna_loss = tf.reduce_mean(self.novelty)
            else:
                raise NotImplementedError

        self.dyna_params = tf.trainable_variables("dynamics")
        self.aux_params = tf.trainable_variables(self.auxiliary_task.scope)
        self.params = self.dyna_params + self.aux_params
        self.aux_loss = self.auxiliary_task.loss
        self.loss = self.aux_loss + self.dyna_loss

        self.queue = PriorityQueue(queue_size)
        self.novelty_rms = RunningMeanStd(epsilon=1e-4)
        self.novelty_normalized = tf.clip_by_value(
            (self.novelty_tf - self.novelty_rms.mean) / self.novelty_rms.std, -5., 5.)
        self.normalized = normalize_novelty
        if normalize_novelty:
            logger.info("normalize novelty")

        self.goal_recoder = DataRecorder(os.path.join(logger.get_dir(), "goal_data"))
        self.goal_store_baseline = 1500
        self.density_estimate = deque(maxlen=int(1e4))
        self.eval_interval = 20
        self.eval_data_status = {}
        self.eval_data = []
        self.eval_path = os.path.join(logger.get_dir(), "novelty_evaluation")
        self.eval_recoder = DataRecorder(self.eval_path)
        self.error_recoder = DataRecorder(os.path.join(logger.get_dir(), "error_goal"))

    def _get_novelty(self):
        if isinstance(self.ac_space, spaces.Box):
            assert len(self.ac_space.shape) == 1
            ac = self.ac  # continuous actions are concatenated as-is
        elif isinstance(self.ac_space, spaces.Discrete):
            ac = tf.one_hot(self.ac, self.ac_space.n)
        elif isinstance(self.ac_space, spaces.MultiDiscrete):
            raise NotImplementedError
        elif isinstance(self.ac_space, spaces.MultiBinary):
            ac = tf.one_hot(self.ac, self.ac_space.n)
        else:
            raise NotImplementedError

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        hidsize = 512
        activ = tf.nn.leaky_relu
        x = fc(add_ac(self.feat), nh=hidsize, scope="fc_1")
        if activ is not None:
            x = activ(x)

        def residual(x, scope):
            res = fc(add_ac(x), nh=hidsize, scope=scope + "_1")
            res = tf.nn.leaky_relu(res)
            res = fc(add_ac(res), nh=hidsize, scope=scope + "_2")
            return x + res

        for _ in range(4):
            x = residual(x, scope="residual_{}".format(_ + 1))
        n_out_features = self.out_feat.get_shape()[-1].value
        x = fc(add_ac(x), nh=n_out_features, scope="output")
        # Novelty is the per-sample prediction error of the forward model in feature space.
        return tf.reduce_mean(tf.square(x - self.out_feat), axis=-1)

    def put_goal(self, obs, actions, next_obs, goal_infos):
        assert list(obs.shape)[1:] == self.obs.get_shape().as_list()[1:], \
            "obs shape: {}, please flatten obs".format(obs.shape)
        assert list(actions.shape)[1:] == self.ac.get_shape().as_list()[1:], \
            "action shape: {}, please flatten actions".format(actions.shape)
        assert list(next_obs.shape)[1:] == self.next_obs.get_shape().as_list()[1:], \
            "next obs shape: {}, please flatten next_obs".format(next_obs.shape)
        assert len(goal_infos.shape) == 1, "info shape: {}".format(goal_infos.shape)

        # Collect evaluation transitions according to x_pos (at most one per eval_interval segment).
        x_pos = [info["x_pos"] for info in goal_infos]
        for index, x in enumerate(x_pos):
            seg = x // self.eval_interval * self.eval_interval
            if seg not in self.eval_data_status:
                self.eval_data_status[seg] = False
            self.density_estimate.append(x)
            if not self.eval_data_status[seg]:
                self.eval_data.append({
                    "obs": obs[index],
                    "actions": actions[index],
                    "next_obs": next_obs[index],
                    "info": goal_infos[index],
                })
                self.eval_data_status[seg] = True
                self.eval_data = sorted(self.eval_data, key=lambda y: y["info"]["x_pos"])
        if np.max(x_pos) > self.goal_store_baseline:
            self.goal_recoder.store(self.eval_data)
            self.goal_recoder.dump()
            self.goal_store_baseline += 1000
            logger.info("stored {} goals, new baseline: {}".format(
                len(self.eval_data), self.goal_store_baseline))

        # Store goals into the queue according to priority (negative, optionally normalized novelty).
        novelty = self.sess.run(self.novelty,
                                feed_dict={self.obs: obs, self.next_obs: next_obs, self.ac: actions})
        if self.normalized:
            self.novelty_rms.update(novelty)
            priority = -self.sess.run(self.novelty_normalized, feed_dict={self.novelty_tf: novelty})
        else:
            priority = -novelty
        stats = self._add_goal(obs, actions, next_obs, goal_infos, priority)
        return stats

    def get_goal(self, nb_goal, replace=True, alpha=1.0, beta=0.95):
        assert self.queue.qsize() >= nb_goal
        goal_priority, goal_feat, goal_obs, goal_act, goal_next_obs, goal_info = [], [], [], [], [], []
        while len(goal_obs) != nb_goal:
            data = self.queue.get()
            if (data[5]["x_pos"] <= 55) and (data[5]["y_pos"] <= 180):
                self.error_recoder.store(data)
                self.error_recoder.dump()
                logger.info("detected an invalid goal {}; removing it".format(data[5]))
                continue
            goal_priority.append(data[0])
            goal_obs.append(data[2])
            goal_act.append(data[3])
            goal_next_obs.append(data[4])
            goal_info.append(data[5])
        goal_priority = np.asarray(goal_priority)
        # IMPORTANT: the goal is the next_obs entry of the stored tuple.
        goals = np.asarray(goal_next_obs)
        if replace:
            # Re-score the fetched goals and put them back with a blended priority.
            goal_act = np.asarray(goal_act)
            goal_next_obs = np.asarray(goal_next_obs)
            novelty = self.sess.run(self.novelty,
                                    feed_dict={self.obs: goal_obs, self.ac: goal_act,
                                               self.next_obs: goal_next_obs})
            if self.normalized:
                self.novelty_rms.update(novelty)
                priority = -self.sess.run(self.novelty_normalized, feed_dict={self.novelty_tf: novelty})
            else:
                priority = -novelty
            priority = (1 - alpha) * goal_priority * beta + alpha * priority
            self._add_goal(goal_obs, goal_act, goal_next_obs, goal_info, priority)
        assert list(goals.shape)[1:] == self.obs.get_shape().as_list()[1:], \
            "goal_obs: {}".format(goals.shape)
        return goals, goal_info

    def _add_goal(self, obs, actions, next_obs, infos, priority):
        # The queue is a min-heap: a lower priority value means a more novel transition.
        baseline = None
        stats = dict()
        for i in range(len(priority)):
            if self.queue.qsize() < self.nenv * 5:
                data = (priority[i], time.time(), obs[i], actions[i], next_obs[i], infos[i])
                self.queue.put(data)
            else:
                if baseline is None:
                    queue_p = [-item[0] for item in self.queue.queue]
                    stats["queue_max"], stats["queue_std"] = np.max(queue_p), np.std(queue_p)
                    baseline = -0.75 * stats["queue_max"]
                if priority[i] < baseline:
                    data = (priority[i], time.time(), obs[i], actions[i], next_obs[i], infos[i])
                    if self.queue.full():
                        # Evict the least novel entry (largest priority value) before inserting.
                        maxvalue_idx = np.argmax([item[0] for item in self.queue.queue])
                        self.queue.queue.pop(maxvalue_idx)
                    self.queue.put(data)
        return stats

    def evaluate(self, steps, plot=False):
        if len(self.eval_data) > 0:
            obs, act, next_obs, x_pos = [], [], [], []
            for i in range(len(self.eval_data)):
                obs.append(self.eval_data[i]["obs"])
                act.append(self.eval_data[i]["actions"])
                next_obs.append(self.eval_data[i]["next_obs"])
                x_pos.append(self.eval_data[i]["info"]["x_pos"])
            obs = np.asarray(obs, dtype=np.float32)
            act = np.asarray(act, dtype=np.float32)
            next_obs = np.asarray(next_obs, dtype=np.float32)
            x_pos = np.asarray(x_pos, dtype=np.float32)
            novelty = self.sess.run(self.novelty,
                                    feed_dict={self.obs: obs, self.ac: act, self.next_obs: next_obs})
            # Pearson correlation between x_pos and novelty on the held-out evaluation transitions.
            p = pearsonr(x_pos, novelty)[0]
            if plot:
                plt.figure(dpi=80)
                plt.subplot(2, 1, 1)
                plt.scatter(x_pos, novelty)
                plt.title("pos & novelty")
                plt.yscale("log")
                plt.subplot(2, 1, 2)
                density = np.array(self.density_estimate)
                sns.kdeplot(density)
                plt.title("sample density")
                plt.savefig(os.path.join(self.eval_path, "{}.png".format(steps)))
                plt.close()
            self.eval_recoder.store({"x_pos": x_pos, "novelty": novelty, "p": p, "steps": steps})
            self.eval_recoder.dump()
            return ["pos_novelty_p"], [p]
        else:
            return ["pos_novelty_p"], [np.nan]
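The priority arithmetic above is easy to lose in the TensorFlow plumbing. Below is a minimal NumPy sketch, not part of the module's API, of how negative (optionally mean/std-normalized and clipped) novelty becomes a min-heap priority, and how get_goal(replace=True) blends the stored priority with a freshly computed one. RunningMeanStdNp, novelty_to_priority, and blend_priority are illustrative names, and the running-statistics class is a stand-in assumption for the TF RunningMeanStd used in the real code.

import numpy as np


class RunningMeanStdNp:
    """Minimal running mean/std tracker (NumPy stand-in for the TF RunningMeanStd)."""

    def __init__(self, epsilon=1e-4):
        self.mean, self.var, self.count = 0.0, 1.0, epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        batch_mean, batch_var, batch_count = x.mean(), x.var(), x.size
        delta = batch_mean - self.mean
        tot = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot
        m2 = self.var * self.count + batch_var * batch_count + delta ** 2 * self.count * batch_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

    @property
    def std(self):
        return np.sqrt(self.var)


def novelty_to_priority(novelty, rms, normalize=True):
    """Lower priority value == more novel, matching the min-heap PriorityQueue convention."""
    novelty = np.asarray(novelty, dtype=np.float64)
    if normalize:
        rms.update(novelty)
        normalized = np.clip((novelty - rms.mean) / rms.std, -5.0, 5.0)
        return -normalized
    return -novelty


def blend_priority(old_priority, new_priority, alpha=1.0, beta=0.95):
    """Priority used when re-inserting fetched goals in get_goal(replace=True)."""
    return (1 - alpha) * old_priority * beta + alpha * new_priority


if __name__ == "__main__":
    rms = RunningMeanStdNp()
    old = novelty_to_priority([0.2, 0.5, 1.3], rms)
    new = novelty_to_priority([0.1, 0.9, 2.0], rms)
    print(blend_priority(old, new, alpha=0.5))

With the default alpha=1.0 the blend reduces to the fresh priority, so beta only matters when alpha < 1, i.e. when some of the stale priority is deliberately retained.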
class Runner(AbstractEnvRunner): def __init__(self, env, model, nsteps, store_data, reward_fn, sample_goal, dist_type, alt_model=None, use_random_policy_expl=None,): super().__init__(env=env, model=model, nsteps=nsteps) assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!' assert isinstance(env, VecFrameStack) self.nact = env.action_space.n nenv = self.nenv self.nbatch = nenv * nsteps self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape # self.obs = env.reset() super method do this self.obs_dtype = env.observation_space.dtype self.obs_shape = env.observation_space.shape self.ac_dtype = env.action_space.dtype self.ac_shape = env.action_space.shape self.nstack = self.env.nstack self.nc = self.batch_ob_shape[-1] // self.nstack self.goal_shape = self.model.goal_shape self.goal_as_image = self.model.goal_as_image self.save_path = os.path.join(logger.get_dir(), "runner_data") self.store_data = store_data self.recorder = DataRecorder(self.save_path) self.dynamics = self.model.dynamics self.sample_goal = sample_goal # self.batch_goal_feat_shape = (nenv*(nsteps+1),) + env.observation_space.shape + self.dynamics.feat_shape self.reached_status = np.array([False for _ in range(self.nenv)], dtype=bool) self.goals, self.goal_info = None, None self.reward_fn = reward_fn # self.results_writer = ResultsWriter(os.path.join(save_path, "evaluation.csv")) self.episode = np.ones(self.nenv) self.episode_step = np.zeros(self.nenv) self.episode_reached_step = np.zeros(self.nenv) self.episode_reward_to_go = np.zeros(self.nenv) self.name = self.model.scope.split("acer_")[1] assert dist_type in ["l1", "l2"] self.dist_type = dist_type self.alt_model = alt_model self.use_random_policy_expl = use_random_policy_expl if self.use_random_policy_expl: assert alt_model is not None def run(self, acer_step=None): if self.goals is None: self.goals, self.goal_info = self.dynamics.get_goal(nb_goal=self.nenv) if not self.goal_as_image: self.goals = self.goal_to_embedding(self.goal_info) # enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1) mb_obs = np.empty((self.nenv, self.nsteps + 1) + self.obs_shape, dtype=self.obs_dtype) mb_act = np.empty((self.nenv, self.nsteps) + self.ac_shape, dtype=self.ac_dtype) mb_mus = np.empty((self.nenv, self.nsteps, self.nact), dtype=np.float32) mb_dones = np.empty((self.nenv, self.nsteps), dtype=bool) mb_masks = np.empty((self.nenv, self.nsteps + 1), dtype=bool) mb_ext_rew = np.empty((self.nenv, self.nsteps), dtype=np.float32) mb_obs_infos = np.empty((self.nenv, self.nsteps), dtype=object) mb_goals = np.empty((self.nenv, self.nsteps + 1) + self.goal_shape, dtype=self.obs_dtype) mb_goal_infos = np.empty((self.nenv, self.nsteps), dtype=object) # mb_obs, mb_actions, mb_mus, mb_dones, mb_ext_rewards = [], [], [], [], [] # mb_obs_infos, mb_goals, mb_goal_infos = [], [], [] reached_step, done_step = np.array([None for _ in range(self.nenv)]), np.array([None for _ in range(self.nenv)]) episode_infos = np.asarray([{} for _ in range(self.nenv)], dtype=object) for step in range(self.nsteps): try: check_obs(self.obs) except ValueError: logger.warn("acer_step:{}, runner_step:{}, empty obs".format(acer_step, step)) raise ValueError actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=self.goals) if self.sample_goal: if self.use_random_policy_expl: actions[self.reached_status] = 
self.simple_random_action(np.sum(self.reached_status)) mus[self.reached_status] = self.get_mu_of_random_action() else: if np.sum(self.reached_status) > 0: alt_action, alt_mu, alt_states = self.alt_model.step(self.obs, S=self.states, M=self.dones, goals=self.goals) actions[self.reached_status] = alt_action[self.reached_status] mus[self.reached_status] = alt_mu[self.reached_status] mb_obs[:, step] = deepcopy(self.obs) mb_act[:, step] = actions mb_mus[:, step, :] = mus mb_masks[:, step] = deepcopy(self.dones) obs, rewards, dones, infos = self.env.step(actions) try: check_infos(infos) except ValueError: logger.warn("warning!wrong infos!program continues anyway") logger.info("infos:{}, dones:{}, acer_step:{}".format(infos, dones, acer_step)) logger.info("please debug it in runner_data/data.pkl") self.recorder.store(infos) self.recorder.dump() for info in infos: info.update({"source": self.name}) enc_obs.append(obs[..., -self.nc:]) mb_dones[:, step] = dones mb_ext_rew[:, step] = rewards self.episode_reward_to_go[self.reached_status] += rewards[self.reached_status] mb_obs_infos[:, step] = np.asarray(infos, dtype=object) mb_goals[:, step] = deepcopy(self.goals) mb_goal_infos[:, step] = deepcopy(self.goal_info) self.episode_step += 1 # states information for statefull models like LSTM self.states = states self.dones = dones self.obs = obs # check reached if self.sample_goal: for env_idx in range(self.nenv): if not self.reached_status[env_idx]: if self.dist_type == "l1": self.reached_status[env_idx] = self.check_goal_reached_v2(infos[env_idx], self.goal_info[env_idx]) else: raise NotImplementedError("I do not know how to compute goal_latent") if self.reached_status[env_idx]: reached_step[env_idx] = step self.episode_reached_step[env_idx] = deepcopy(self.episode_step[env_idx]) # check done done_step[self.dones] = step # revise goal if not self.sample_goal: for env_idx in range(self.nenv): if self.dones[env_idx]: # (- - done(t)) -> (done done, done(t)) start, end = 0, step + 1 if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) mb_goal_infos[env_idx, start:end] = infos[env_idx] elif step == self.nsteps - 1: if done_step[env_idx] is None: # (- - t) -> (t, t, t) start = 0 else: # (- - done - - t) -> (- - - t, t, t) start = done_step[env_idx] + 1 end = step + 1 if end == start: continue if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) mb_goal_infos[env_idx, start:end] = infos[env_idx] else: for env_idx in range(self.nenv): if step != self.nsteps - 1: # dones is instant variable but reached_status is a transitive variable if self.dones[env_idx] and self.reached_status[env_idx]: if reached_step[env_idx] is None: # reach|[- - done] -> [done, done, done] start, end = 0, step + 1 if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) mb_goal_infos[env_idx, start:end] = infos[env_idx] else: # [- - reach(done)] -> [ - - -] if reached_step[env_idx] == step # [- - reach - - done] -> [- - - done done done] start, end = reached_step[env_idx] + 1, step + 1 if end == start: continue if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) mb_goal_infos[env_idx, start:end] = infos[env_idx] elif not self.dones[env_idx] and 
self.reached_status[env_idx]: # reached|[ - - -] if reached_step[env_idx] is None: # [- - reached - -] if reached_step[env_idx] is not None pass else: # [- - - done] if self.dones[env_idx] and not self.reached_status[env_idx] # [- - - - -] if not self.dones[env_idx] and not self.reached_status[env_idx] pass else: if self.dones[env_idx] and self.reached_status[env_idx]: if reached_step[env_idx] is None: # reach|[- - done(t)] -> [done, done, done(t)] start, end = 0, step + 1 if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) mb_goal_infos[env_idx, start:end] = infos[env_idx] else: # [- - reach(done)(t)] -> [- - -] # [- - reach - - done(t)] -> [- - - done done done(t)] start, end = reached_step[env_idx] + 1, step + 1 if end == start: continue if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) mb_goal_infos[env_idx, start:end] = infos[env_idx] elif not self.dones[env_idx] and self.reached_status[env_idx]: if reached_step[env_idx] is None: # reached|[ - - t] -> reached|[t t t] start, end = 0, step + 1 else: # reached[- - r - -] -> reached|[- - - t t] start, end = reached_step[env_idx] + 1, step + 1 if end == start: continue if self.goal_as_image: mb_goals[env_idx, start:end] = mb_obs[env_idx, step] else: mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx]) else: # [- - - done(t)] if self.dones[env_idx] and not self.reached_status[env_idx] # [- - - - (t)] if not self.dones[env_idx] and not self.reached_status[env_idx] pass # summary for env_idx in range(self.nenv): info = infos[env_idx] if self.dones[env_idx]: assert info.get("episode") if info.get("episode"): episode_infos[env_idx]["episode"] = info.get("episode") if not self.sample_goal: episode_infos[env_idx]["reached_info"] = dict(source=self.name, x_pos=infos[env_idx]["x_pos"], y_pos=infos[env_idx]["y_pos"]) else: if self.reached_status[env_idx]: reached = 1.0 time_ratio = self.episode_reached_step[env_idx] / self.episode_step[env_idx] achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]} mem = dict(env=env_idx, is_succ=True, goal=self.goal_info[env_idx], final_pos=achieved_pos, timestep=acer_step, episode=self.episode[env_idx], step=self.episode_step[env_idx]) self.recorder.store(mem) self.log(mem) abs_dist = 10 else: reached = 0.0 time_ratio = 1.0 achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]} mem = dict(env=env_idx, is_succ=False, goal=self.goal_info[env_idx], final_pos=achieved_pos, timestep=acer_step, episode=self.episode[env_idx], step=self.episode_step[env_idx]) self.recorder.store(mem) self.log(mem) abs_dist = abs(float(infos[env_idx]["x_pos"]) - float(self.goal_info[env_idx]["x_pos"])) + \ abs(float(infos[env_idx]["y_pos"]) - float(self.goal_info[env_idx]["y_pos"])) episode_infos[env_idx]["reached_info"] = dict(reached=reached, time_ratio=time_ratio, abs_dist=abs_dist, source=self.name, x_pos=infos[env_idx]["x_pos"], y_pos=infos[env_idx]["y_pos"]) episode_infos[env_idx]["goal_info"] = dict(x_pos=self.goal_info[env_idx]["x_pos"], y_pos=self.goal_info[env_idx]["y_pos"], source=self.goal_info[env_idx]["source"], reward_to_go=self.episode_reward_to_go[env_idx]) # re-plan goal goal_obs, goal_info = self.dynamics.get_goal(nb_goal=1) if self.goal_as_image: self.goals[env_idx] = goal_obs[0] else: self.goals[env_idx] = self.goal_to_embedding(goal_info[0]) 
self.goal_info[env_idx] = goal_info[0] self.episode[env_idx] += 1 self.episode_step[env_idx] = 0 self.episode_reached_step[env_idx] = 0 self.reached_status[env_idx] = False self.episode_reward_to_go[env_idx] = 0 # next obs and next goal mb_obs[:, -1] = deepcopy(self.obs) mb_goals[:, -1] = mb_goals[:, -2] # we cannot use self.goal since it way be revised if self.dist_type == "l2": raise NotImplementedError else: mb_int_rewards = self.reward_fn(mb_obs_infos, mb_goal_infos) # shapes are adjusted to [nenv, nsteps, []] enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0) self.recorder.dump() results = dict( enc_obs=enc_obs, obs=mb_obs, actions=mb_act, ext_rewards=mb_ext_rew, mus=mb_mus, dones=mb_dones, masks=mb_masks, obs_infos=mb_obs_infos, # nenv, nsteps, two purpose: 1)put into dynamics; 2) put into buffer episode_infos=episode_infos, goal_obs=mb_goals, # nenv, nsteps+1, goal_infos=mb_goal_infos, int_rewards=mb_int_rewards ) return results def check_goal_reached(self, obs_feat, desired_goal): assert obs_feat.shape == desired_goal.shape assert len(obs_feat.shape) == 1 if self.dynamics.dummy: return False else: eps = 1e-6 tol = 0.03 status = (np.square(obs_feat - desired_goal).sum() / (np.square(desired_goal).sum() + eps)) < tol return status @staticmethod def check_goal_reached_v2(obs_info, goal_info): eps = 20 obs_x, obs_y = float(obs_info["x_pos"]), float(obs_info["y_pos"]) goal_x, goal_y = float(goal_info["x_pos"]), float(goal_info["y_pos"]) dist = abs(obs_x - goal_x) + abs(obs_y - goal_y) if dist < eps: status = True else: status = False return status def simple_random_action(self, nb_action): return np.random.randint(0, self.env.action_space.n, nb_action) def get_mu_of_random_action(self): assert isinstance(self.env.action_space, spaces.Discrete) return np.array([1 / self.env.action_space.n for _ in range(self.env.action_space.n)]) @staticmethod def goal_to_embedding(goal_infos): feat_dim = 512 nb_tile = feat_dim // 2 if isinstance(goal_infos, dict): goal_embedding = np.array([goal_infos["x_pos"], goal_infos["y_pos"]], dtype=np.float32).reshape(1, 2) goal_embedding = np.tile(goal_embedding, [1]*len(goal_embedding.shape[:-1])+[nb_tile]) return goal_embedding def get_pos(x): return float(x["x_pos"]), float(x["y_pos"]) vf = np.vectorize(get_pos) goal_pos = vf(goal_infos) goal_x, goal_y = np.expand_dims(goal_pos[0], -1).astype(np.float32), np.expand_dims(goal_pos[1], -1).astype(np.float32) goal_embedding = np.concatenate([goal_x, goal_y], axis=-1) goal_embedding = np.tile(goal_embedding, [1]*len(goal_embedding.shape[:-1])+[nb_tile]) return goal_embedding def initialize(self, init_steps): mb_obs, mb_actions, mb_next_obs, mb_goal_infos = [], [], [], [] for _ in range(init_steps): mb_obs.append(deepcopy(self.obs)) actions = np.asarray([self.env.action_space.sample() for _ in range(self.nenv)]) self.obs, rewards, dones, infos = self.env.step(actions) goal_infos = np.array([{"x_pos": info.get("x_pos", None), "y_pos": info.get("y_pos", None), "source": self.name} for info in infos], dtype=object) mb_goal_infos.append(goal_infos) mb_actions.append(actions) mb_next_obs.append(deepcopy(self.obs)) mb_obs = np.asarray(mb_obs).swapaxes(1, 0) # (nenv, nstep, obs_shape) mb_goal_infos = np.asarray(mb_goal_infos, dtype=object).swapaxes(1, 0) # (nenv, nstep, dict) mb_actions = np.asarray(mb_actions).swapaxes(1, 0) mb_next_obs = np.asarray(mb_next_obs).swapaxes(1, 0) batch_size = min(128, init_steps) ind = np.random.randint(0, init_steps, batch_size) mb_obs = mb_obs.reshape((-1,) + 
mb_obs.shape[2:])[ind] mb_goal_infos = mb_goal_infos.reshape(-1, )[ind] mb_actions = mb_actions.reshape((-1,) + mb_actions.shape[2:])[ind] mb_next_obs = mb_next_obs.reshape((-1,) + mb_next_obs.shape[2:])[ind] for i in range(10): self.model.train_dynamics(mb_obs, mb_actions, mb_next_obs, 0) self.dynamics.put_goal(mb_obs, mb_actions, mb_next_obs, mb_goal_infos) self.obs = self.env.reset() def evaluate(self, nb_eval): assert self.dynamics.dummy goal_obs, goal_info = self.dynamics.get_goal(nb_goal=self.nenv) # (nenv, goal_dim) eval_info = {"l": 0, "r": 0} for i in range(nb_eval): terminal = False while True: actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=goal_obs) obs, rewards, dones, infos = self.env.step(actions) info = infos[0] if info.get("episode"): assert dones[0] eval_info["l"] += info.get("episode")["l"] eval_info["r"] += info.get("episode")["r"] terminal = True if terminal: break self.states = states self.dones = dones self.obs = obs self.obs = self.env.reset() eval_info["l"] /= nb_eval eval_info["r"] /= nb_eval return eval_info def log(self, mem): succ = "succ" if mem["is_succ"] else "fail" template = "env_{} {}|goal:{}|final_pos:{}|size:{}".format( mem["env"], succ, {"x_pos": mem["goal"]["x_pos"], "y_pos": mem["goal"]["y_pos"]}, mem["final_pos"], self.dynamics.queue.qsize() ) logger.info(template)
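For reference, the goal-conditioning conventions used throughout the Runner can be reproduced in isolation. The following is a standalone sketch of the L1 reached test from check_goal_reached_v2 and of the dict-input branch of goal_to_embedding; the names l1_reached and pos_embedding are illustrative and not part of the module.

import numpy as np

FEAT_DIM = 512  # matches the feat_dim assumed by goal_to_embedding
EPS = 20.0      # L1 threshold used by check_goal_reached_v2


def l1_reached(obs_info, goal_info, eps=EPS):
    """Reached when the Manhattan distance between agent and goal positions is below eps."""
    dist = abs(float(obs_info["x_pos"]) - float(goal_info["x_pos"])) + \
           abs(float(obs_info["y_pos"]) - float(goal_info["y_pos"]))
    return dist < eps


def pos_embedding(goal_info, feat_dim=FEAT_DIM):
    """(x_pos, y_pos) tiled feat_dim // 2 times -> a (1, feat_dim) goal embedding."""
    xy = np.array([goal_info["x_pos"], goal_info["y_pos"]], dtype=np.float32).reshape(1, 2)
    return np.tile(xy, (1, feat_dim // 2))


if __name__ == "__main__":
    goal = {"x_pos": 700, "y_pos": 79}
    agent = {"x_pos": 712, "y_pos": 79}
    print(l1_reached(agent, goal))    # True: |712 - 700| + |79 - 79| = 12 < 20
    print(pos_embedding(goal).shape)  # (1, 512)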