Example #1
    def train(self, data, rollout_size, online=True, save_path=None):
        """
        :param data: list of dict. [{"obs": arr, "act": arr, "next_obs": arr, "x_pos": arr}]
        """
        self.recoder = DataRecorder(os.path.join(save_path, "training"))
        for episode, episode_data in enumerate(data):
            episode_length = len(episode_data["obs"])
            obs, act, next_obs, x_pos = episode_data["obs"], episode_data["act"], episode_data["next_obs"], episode_data["x_pos"]
            episode_novelty = []
            if not online:
                ind = np.random.permutation(episode_length)
                obs, act, next_obs, x_pos = obs[ind], act[ind], next_obs[ind], x_pos[ind]

            for start in range(0, episode_length, rollout_size):
                end = start + rollout_size
                batch_obs, batch_act, batch_next_obs, batch_x_pos = obs[start:end], act[start:end], next_obs[start:end], x_pos[start:end]

                novelty = self.sess.run(self.dynamics.novelty, feed_dict={self.dynamics.obs: obs,
                                                                          self.dynamics.ac: act,
                                                                          self.dynamics.next_obs: next_obs})
                self.sess.run(self.train_op, feed_dict={self.dynamics.obs: batch_obs, self.dynamics.ac: batch_act,
                                                        self.dynamics.next_obs: batch_next_obs})
                p = pearsonr(x_pos, novelty)[0]
                logger.info("Episode:{}|Epoch:{}|P:{}".format(episode, start//rollout_size, p))
                episode_novelty.append(novelty)
                self.recoder.store({"x_pos": x_pos, "novelty": novelty, "episode": episode, "epoch": start//rollout_size,
                                    "p": p})
                plt.figure()
                plt.scatter(x_pos, novelty)
                # plt.yscale("log")
                plt.savefig(os.path.join(save_path, "{}_{}.png".format(episode, start//rollout_size)))
                plt.close()
            self.recoder.dump()
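
DataRecorder is a project-specific helper whose implementation is not shown in these snippets. Judging only from the store()/dump() calls above (buffer one dict per step, flush once per episode), a minimal pickle-based stand-in might look like the sketch below; the class name and file layout are assumptions.

import os
import pickle


class SimpleRecorder:
    """Minimal stand-in for DataRecorder: buffer dicts in memory, append-pickle them on dump()."""

    def __init__(self, save_dir):
        os.makedirs(save_dir, exist_ok=True)
        self.path = os.path.join(save_dir, "data.pkl")
        self.buffer = []

    def store(self, item):
        # called once per step/epoch with a dict of arrays and scalars
        self.buffer.append(item)

    def dump(self):
        # called once per episode; append the buffered records and reset
        with open(self.path, "ab") as f:
            pickle.dump(self.buffer, f)
        self.buffer = []
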
Example #2
    def __init__(self,
                 env,
                 model,
                 nsteps,
                 store_data,
                 reward_fn,
                 sample_goal,
                 threshold=None,
                 alt_model=None,
                 use_random_policy_expl=None):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(
            env.action_space, spaces.Discrete
        ), 'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv *
                               (nsteps + 1), ) + env.observation_space.shape

        # self.obs = env.reset()  the super().__init__ call already does this
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype
        self.ac_shape = env.action_space.shape
        self.nstack = self.env.nstack
        self.nc = self.batch_ob_shape[-1] // self.nstack
        self.goal_shape = self.model.goal_shape
        self.goal_as_image = self.model.goal_as_image

        self.save_path = os.path.join(logger.get_dir(), "runner_data")
        self.store_data = store_data
        self.recorder = DataRecorder(self.save_path)

        self.dynamics = self.model.dynamics
        self.sample_goal = sample_goal
        self.threshold = threshold
        # self.batch_goal_feat_shape = (nenv*(nsteps+1),) + env.observation_space.shape + self.dynamics.feat_shape
        self.reached_status = np.array([False for _ in range(self.nenv)],
                                       dtype=bool)
        self.goals, self.goal_info = None, None
        self.reward_fn = reward_fn
        # self.results_writer = ResultsWriter(os.path.join(save_path, "evaluation.csv"))

        self.episode = np.ones(self.nenv)
        self.episode_step = np.zeros(self.nenv)
        self.episode_reached_step = np.zeros(self.nenv)
        self.episode_reward_to_go = np.zeros(self.nenv)

        self.name = self.model.scope.split("acer_")[1]

        self.alt_model = alt_model
        self.use_random_policy_expl = use_random_policy_expl
        if self.use_random_policy_expl:
            assert alt_model is not None
Example #3
 def __init__(self, *, env, model, nsteps, gamma, lam, save_path,
              store_data):
     super().__init__(env=env, model=model, nsteps=nsteps)
     # Lambda used in GAE (Generalized Advantage Estimation)
     self.lam = lam
     # Discount rate
     self.gamma = gamma
     self.recorder = DataRecorder(save_path)
     self.episode = np.zeros(self.nenv)
     self.timestamp = np.zeros(self.nenv)
     self.store_data = store_data
Example #4
class Model:
    def __init__(self, sess, env, aux_task, feat_dim, lr):
        self.sess = sess or tf.Session()

        self.dynamics = Dynamics(sess=self.sess, env=env, auxiliary_task=aux_task, feat_dim=feat_dim,
                                 queue_size=1000, normalize_novelty=True)

        self.obs_shape = env.observation_space.shape
        self.ac_shape = env.action_space.shape
        del env
        self.opt = tf.train.RMSPropOptimizer(lr, decay=0.99)
        self.aux_loss = self.dynamics.aux_loss
        self.dyna_loss = self.dynamics.dyna_loss
        self.loss = self.aux_loss + self.dyna_loss

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        gradsandvars = self.opt.compute_gradients(self.loss, params)
        self.train_op = self.opt.apply_gradients(gradsandvars)

        self.train_history = []

    def train(self, data, rollout_size, online=True, save_path=None):
        """
        :param data: list of dict. [{"obs": arr, "act": arr, "next_obs": arr, "x_pos": arr}]
        """
        self.recoder = DataRecorder(os.path.join(save_path, "training"))
        for episode, episode_data in enumerate(data):
            episode_length = len(episode_data["obs"])
            obs, act, next_obs, x_pos = episode_data["obs"], episode_data["act"], episode_data["next_obs"], episode_data["x_pos"]
            episode_novelty = []
            if not online:
                ind = np.random.permutation(episode_length)
                obs, act, next_obs, x_pos = obs[ind], act[ind], next_obs[ind], x_pos[ind]

            for start in range(0, episode_length, rollout_size):
                end = start + rollout_size
                batch_obs, batch_act, batch_next_obs, batch_x_pos = obs[start:end], act[start:end], next_obs[start:end], x_pos[start:end]

                novelty = self.sess.run(self.dynamics.novelty, feed_dict={self.dynamics.obs: obs,
                                                                          self.dynamics.ac: act,
                                                                          self.dynamics.next_obs: next_obs})
                self.sess.run(self.train_op, feed_dict={self.dynamics.obs: batch_obs, self.dynamics.ac: batch_act,
                                                        self.dynamics.next_obs: batch_next_obs})
                p = pearsonr(x_pos, novelty)[0]
                logger.info("Episode:{}|Epoch:{}|P:{}".format(episode, start//rollout_size, p))
                episode_novelty.append(novelty)
                self.recoder.store({"x_pos": x_pos, "novelty": novelty, "episode": episode, "epoch": start//rollout_size,
                                    "p": p})
                plt.figure()
                plt.scatter(x_pos, novelty)
                # plt.yscale("log")
                plt.savefig(os.path.join(save_path, "{}_{}.png".format(episode, start//rollout_size)))
                plt.close()
            self.recoder.dump()
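
A hedged usage sketch for Model.train() above: each episode dict needs the keys the loop reads ("obs", "act", "next_obs", "x_pos"). The shapes, hyperparameters, and the commented-out constructor arguments below are placeholders, not values taken from the project.

import numpy as np

# Hypothetical episode data; only the keys matter for train().
episode_length = 512
episode = {
    "obs": np.zeros((episode_length, 84, 84, 4), dtype=np.uint8),
    "act": np.zeros((episode_length,), dtype=np.int64),
    "next_obs": np.zeros((episode_length, 84, 84, 4), dtype=np.uint8),
    "x_pos": np.arange(episode_length, dtype=np.float32),
}
data = [episode]

# model = Model(sess=None, env=env, aux_task="rnd", feat_dim=512, lr=1e-4)
# model.train(data, rollout_size=128, online=True, save_path="/tmp/novelty")
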
Example #5
    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.keys = ["episode_return", "episode_length", "rewards", "her_gain"]
        self.episode_stats = EpisodeStats(maxlen=10, keys=self.keys)
        self.steps = 0
        self.save_interval = self.runner.save_interval
        self.recoder = DataRecorder(os.path.join(logger.get_dir(), "samples"))

        sess = self.model.sess
        self.save = functools.partial(save_variables,
                                      sess=sess,
                                      variables=self.model.params)
Example #6
File: runner.py  Project: liziniu/Maze
    def __init__(self, env, model, nsteps, save_interval):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(
            env.action_space, spaces.Discrete
        ), 'This ACER implementation works only with discrete action spaces!'

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv *
                               (nsteps + 1), ) + env.observation_space.shape

        # self.obs = env.reset()
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype

        self.recoder = DataRecorder(
            os.path.join(logger.get_dir(), "runner_data"))
        self.save_interval = save_interval

        self.size = [int(x) for x in self.env.spec.id.split("-")[2].split("x")]
        self.desired_pos = np.asarray(self.size) - 1
        logger.info("-" * 50)
        logger.info("-" * 15, "desired_pos:", self.desired_pos, "-" * 15)
        logger.info("-" * 50)

        self.goals, self.goal_infos = self.get_goal(self.nenv)
        self.episode_step = np.zeros(self.nenv, dtype=np.int32)
        self.episode = np.zeros(self.nenv, dtype=np.int32)
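
The maze size is recovered from the env id string. Assuming an id shaped like "Maze-v0-10x10" (the exact format is project-specific and not shown here), the parsing above reduces to:

env_id = "Maze-v0-10x10"  # hypothetical id
size = [int(x) for x in env_id.split("-")[2].split("x")]
desired_pos = [s - 1 for s in size]
print(size, desired_pos)  # [10, 10] [9, 9]
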
Example #7
File: runner2.py  Project: liziniu/Maze
    def __init__(self, env, model, nsteps, total_steps, save_interval, her):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(
            env.action_space, spaces.Discrete
        ), 'This ACER implementation works only with discrete action spaces!'

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv *
                               (nsteps + 1), ) + env.observation_space.shape

        # self.obs = env.reset()
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype

        self.recoder = DataRecorder(
            os.path.join(logger.get_dir(), "runner_data"))
        self.save_interval = save_interval

        self.total_steps = total_steps

        self.maze_shape = [
            int(x) for x in self.env.spec.id.split("-")[2].split("x")
        ]
        self.desired_pos = np.asarray(self.maze_shape) - 1
        logger.info("-" * 50)
        logger.info("-" * 15, "desired_pos:", self.desired_pos, "-" * 15)
        logger.info("-" * 50)

        self.her = her

        assert self.nenv == 1
        self.controller = MetaController(self.maze_shape,
                                         env.observation_space.shape,
                                         env.observation_space.dtype)
        self.goal_infos = [{} for _ in range(self.nenv)]
        self.goals = [self.controller.sample_goal() for _ in range(self.nenv)]
        self.mem = ""

        self.episode_step = np.zeros(self.nenv, dtype=np.int32)
        self.episode = np.zeros(self.nenv, dtype=np.int32)

        self.max_episode_length = 1000
Example #8
    def __init__(self, env, model, nsteps, total_steps, save_interval, her):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv*(nsteps+1),) + env.observation_space.shape

        # self.obs = env.reset()
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype
        
        self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
        self.save_interval = save_interval

        self.total_steps = total_steps

        self.maze_shape = [int(x) for x in self.env.spec.id.split("-")[2].split("x")]
        self.desired_pos = arr_to_one_hot(np.asarray(self.maze_shape) - 1, ncat=self.maze_shape[0])
        logger.info("-"*50)
        logger.info("-"*15, "desired_pos:", self.desired_pos, "-"*15)
        logger.info("-"*50)

        self.her = her

        assert self.nenv == 1
        self.controller = MetaController(self.maze_shape, env.observation_space.shape, env.observation_space.dtype)
        self.allowed_step = [np.prod(self.maze_shape)*10]
        self.allowed_step = [np.inf]
        self.goal_infos = [{}]
        self.goals = np.array([self.controller.sample_goal()])
        self.aux_goal = np.copy(self.goals[0])
        self.mem = ""

        self.episode_step = np.zeros(self.nenv, dtype=np.int32)
        self.episode = np.zeros(self.nenv, dtype=np.int32)
        self.aux_step = np.zeros(self.nenv, dtype=np.int32)
        self.aux_dones = np.empty(self.nenv, dtype=bool)
        self.max_episode_length = 3000
        self.aux_dones.fill(False)
        self.aux_entropy = 0.
        self.tar_entropy = 0.
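
arr_to_one_hot is not shown in these snippets. A hypothetical reconstruction consistent with how it is called above (encoding each integer coordinate with ncat categories) could be:

import numpy as np


def arr_to_one_hot(arr, ncat):
    """Hypothetical helper: encode each integer in `arr` as a one-hot vector of length `ncat`."""
    arr = np.asarray(arr, dtype=np.int64)
    return np.eye(ncat, dtype=np.float32)[arr]


# e.g. for a 10x10 maze, the desired position (9, 9) becomes a (2, 10) one-hot array
print(arr_to_one_hot(np.array([9, 9]), ncat=10).shape)  # (2, 10)
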
Example #9
class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam, save_path,
                 store_data):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (Generalized Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma
        self.recorder = DataRecorder(save_path)
        self.episode = np.zeros(self.nenv)
        self.timestamp = np.zeros(self.nenv)
        self.store_data = store_data

    def run(self):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        # For n in range number of steps
        for _ in range(self.nsteps):
            # Given observations, get action value and neglogpacs
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
            obs_tmp = self.obs.copy()
            actions, values, self.states, neglogpacs = self.model.step(
                self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)

            # Take actions in env and look at the results
            # infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            self.timestamp += 1
            # todo: add x,y,obs to pkl file.
            for env_idx, info in enumerate(infos):
                maybeepinfo = info.get('episode')
                if self.store_data:
                    data = dict(episode=self.episode[env_idx],
                                timestamp=self.timestamp[env_idx],
                                x_pos=info["x_pos"],
                                y_pos=info["y_pos"],
                                obs=obs_tmp[env_idx],
                                next_obs=self.obs[env_idx],
                                act=actions[env_idx],
                                value=values[env_idx])
                    self.recorder.store(data)
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
                    if self.store_data:
                        self.episode[env_idx] += 1
                        self.timestamp[env_idx] = 0
                        self.recorder.dump()
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions,
                            mb_values, mb_neglogpacs)), mb_states, epinfos)
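
For reference, the GAE recursion in run() above, restated as a self-contained function with the same math: delta_t = r_t + gamma * V_{t+1} * (1 - done_{t+1}) - V_t and adv_t = delta_t + gamma * lam * (1 - done_{t+1}) * adv_{t+1}.

import numpy as np


def compute_gae(rewards, values, dones, last_values, last_dones, gamma, lam):
    """Backward GAE pass over arrays shaped (nsteps, nenv), mirroring Runner.run()."""
    nsteps = rewards.shape[0]
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    returns = advs + values
    return advs, returns
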
Example #10
class Acer:
    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.keys = ["episode_return", "episode_length", "rewards", "her_gain"]
        self.episode_stats = EpisodeStats(maxlen=10, keys=self.keys)
        self.steps = 0
        self.save_interval = self.runner.save_interval
        self.recoder = DataRecorder(os.path.join(logger.get_dir(), "samples"))

        sess = self.model.sess
        self.save = functools.partial(save_variables,
                                      sess=sess,
                                      variables=self.model.params)

    def call(self, replay_start, nb_train_epoch):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps

        results = runner.run()
        buffer.put(results)

        self.record_episode_info(results["episode_info"])
        obs, next_obs, actions, rewards, mus, dones, masks, goal_obs = self.adjust_shape(
            results)
        names_ops, values_ops = model.train_policy(obs, next_obs, actions,
                                                   rewards, dones, mus,
                                                   model.initial_state, masks,
                                                   steps, goal_obs)

        if buffer.has_atleast(replay_start):
            for i in range(nb_train_epoch):
                if i == 0:
                    results = buffer.get(use_cache=False)
                else:
                    results = buffer.get(use_cache=True)
                obs, next_obs, actions, rewards, mus, dones, masks, goal_obs = self.adjust_shape(
                    results)
                names_ops, values_ops = model.train_policy(
                    obs, next_obs, actions, rewards, dones, mus,
                    model.initial_state, masks, steps, goal_obs)
                self.episode_stats.feed(np.mean(rewards), "rewards")
                self.episode_stats.feed(results["her_gain"], "her_gain")

        if int(steps / runner.nbatch) % self.log_interval == 0:
            names_ops, values_ops = names_ops + [
                "memory_usage(GB)"
            ], values_ops + [self.buffer.memory_usage]
            self.log(names_ops, values_ops)

            if int(steps / runner.nbatch) % (self.log_interval * 200) == 0:
                self.save(
                    os.path.join(logger.get_dir(),
                                 "{}.pkl".format(self.steps)))

        if self.save_interval > 0 and int(
                steps / runner.nbatch) % self.save_interval == 0:
            results["acer_steps"] = self.steps
            self.recoder.store(results)
            self.recoder.dump()

    def adjust_shape(self, results):
        runner = self.runner

        obs = results["obs"][:, :-1].copy()
        # next_obs = results["obs"][:, 1:].copy()
        next_obs = results["next_obs"].copy()
        obs = obs.reshape((runner.nbatch, ) + runner.obs_shape)
        next_obs = next_obs.reshape((runner.nbatch, ) + runner.obs_shape)

        actions = results["actions"].reshape(runner.nbatch)
        rewards = results["rewards"].reshape(runner.nbatch)
        mus = results["mus"].reshape([runner.nbatch, runner.nact])
        dones = results["dones"].reshape([runner.nbatch])
        masks = results["masks"].reshape([runner.batch_ob_shape[0]])
        goal_obs = results["goal_obs"].reshape((runner.nbatch, ) +
                                               runner.obs_shape)
        return obs, next_obs, actions, rewards, mus, dones, masks, goal_obs

    def record_episode_info(self, episode_info):
        returns = episode_info.get("episode", None)
        if returns:
            self.episode_stats.feed(returns["r"], "episode_return")
            self.episode_stats.feed(returns["l"], "episode_length")

    def log(self, names_ops, values_ops):
        logger.record_tabular("total_timesteps", self.steps)
        logger.record_tabular("fps",
                              int(self.steps / (time.time() - self.tstart)))
        for name, val in zip(names_ops, values_ops):
            logger.record_tabular(name, float(val))
        for key in self.keys:
            logger.record_tabular(key, self.episode_stats.get_mean(key))
        logger.dump_tabular()
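
Acer.call() expects an outer loop that sets self.tstart and advances self.steps by runner.nbatch after each call. The project's real entry point is not shown here; a hypothetical driver, with placeholder argument values, might look like:

import time


def acer_learn(acer, runner, total_timesteps, replay_start=10000, nb_train_epoch=4):
    """Hypothetical outer loop for Acer.call(); all numeric defaults are placeholders."""
    acer.tstart = time.time()
    while acer.steps < total_timesteps:
        acer.call(replay_start=replay_start, nb_train_epoch=nb_train_epoch)
        acer.steps += runner.nbatch
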
Example #11
    def run(self, acer_steps):
        if self.recoder is None:
            self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
        if self.wrong_recorder is None:
            self.wrong_recorder = DataRecorder(os.path.join(logger.get_dir(), "wrong_data"))
        mb_obs, mb_next_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_death = [], [], [], [], [], [], []
        mb_next_obs_infos, mb_desired_goal_infos = [], []
        mb_achieved_goal, mb_next_achieved_goal, mb_desired_goal, mb_desired_goal_state = [], [], [], []

        for step in range(self.nsteps):
            actions, mus = self.model.step({
                'obs': self.obs.copy(),
                'achieved_goal': self.achieved_goal.copy(),
                'desired_goal': self.desired_goal.copy(),
                'desired_goal_state': self.desired_goal_state.copy()
            })
            mb_obs.append(np.copy(self.obs))
            mb_achieved_goal.append(np.copy(self.achieved_goal))
            mb_desired_goal.append(np.copy(self.desired_goal))
            mb_desired_goal_state.append(np.copy(self.desired_goal_state))
            mb_actions.append(actions)
            mb_mus.append(mus)
            mb_desired_goal_infos.append(np.copy(self.desired_goal_info))

            # step
            if self.dict_obs:
                dict_obs, _, dones, infos = self.env.step(actions)
                obs, achieved_goal = dict_obs['observation'], dict_obs['achieved_goal']
                achieved_goal = np.tile(achieved_goal, [1, self.nb_tile])   # expand from 2-d to 256-d
            else:
                obs, _, dones, infos = self.env.step(actions)
            rewards = np.zeros(self.nenv, np.float32)
            death = np.array([False for _ in range(self.nenv)], dtype=np.bool)
            self.episode_step += 1
            for e in range(self.nenv):
                if infos[e]['x_pos'] == 65535:
                    infos[e]['x_pos'] = 0

            # get real next obs and achieved goal
            next_obs = obs.copy()
            next_achieved_goal = goal_info_to_embedding(infos, goal_dim=self.achieved_goal.shape[-1])
            for e in range(self.nenv):
                if dones[e]:
                    if self.dict_obs:
                        _dict_obs = infos[e]['next_obs']
                        _obs = _dict_obs['observation']
                    else:
                        _obs = infos[e].get('next_obs')
                    assert _obs is not None
                    next_obs[e] = _obs
            mb_next_obs.append(next_obs)
            mb_next_obs_infos.append(infos)
            mb_next_achieved_goal.append(next_achieved_goal)

            # detecting wrong x_pos:
            for e in range(self.nenv):
                x_pos = infos[e].get('x_pos')
                if x_pos > 3000:
                    logger.info('detected a wrong x_pos:{}'.format(x_pos))
                    data = {'obs': self.obs[e], 'next_obs': obs[e], 'action': actions[e], 'info': infos[e],
                            'episode_step': self.episode_step[e], 'true_next_obs': next_obs[e], 'acer_step': acer_steps}
                    self.wrong_recorder.store(data)
                    self.wrong_recorder.dump()

            # achieved & episode done
            for e in range(self.nenv):
                reached = self.check_goal_reached_v2(infos[e], self.desired_goal_info[e])
                if reached or self.episode_step[e] > self.curriculum.allow_step or infos[e]["x_pos"] > self.desired_goal_info[e]["x_pos"] + 100:
                    # log info
                    final_pos = {"x_pos": infos[e]["x_pos"], "y_pos": infos[e]["y_pos"]}
                    if reached:
                        succ = True
                    else:
                        succ = False
                    self.recoder.store(dict(env=e, succ=succ, length=self.episode_step[e], final_pos=final_pos))
                    logger.info(self.TEMPLATE.format(e, succ, self.desired_goal_info[e], final_pos, self.episode_step[e]))

                    # episode info
                    self.log_episode_step.append(self.episode_step[e])
                    self.log_episode_success.append(1.0 if succ else 0.0)
                    self.log_episode_x_pos.append(infos[e]['x_pos'])
                    self.log_episode_y_pos.append(infos[e]['y_pos'])
                    self.episode_step[e] = 0

                    # reward and dones
                    if reached:
                        rewards[e] = 1.0
                    dones[e] = True

                    # reset
                    if self.dict_obs:
                        _dict_obs = self.env.reset_v2(e)
                        obs[e], achieved_goal[e] = _dict_obs['observation'][0], np.tile(_dict_obs['achieved_goal'][0], self.nb_tile)
                        assert np.array_equal(achieved_goal[e], np.tile(np.array([40., 176.]), self.nb_tile))
                    else:
                        _obs = self.env.reset_v2(e)[0]
                        obs[e] = _obs
                    # curriculum
                    self.curriculum.update(succ=succ, acer_steps=acer_steps)
                    self.desired_goal[e], self.desired_goal_state[e], self.desired_goal_info[e] = self.curriculum.get_current_target(nb_goal=1)
                elif dones[e]:
                    # log info
                    final_pos = {"x_pos": infos[e]["x_pos"], "y_pos": infos[e]["y_pos"]}
                    self.recoder.store(dict(env=e, succ=False, length=self.episode_step[e], final_pos=final_pos))
                    logger.info(self.TEMPLATE.format(e, False, self.desired_goal_info[e], final_pos, self.episode_step[e]))

                    # episode info
                    self.log_episode_step.append(self.episode_step[e])
                    self.log_episode_success.append(0.0)
                    self.log_episode_x_pos.append(infos[e]['x_pos'])
                    self.log_episode_y_pos.append(infos[e]['y_pos'])
                    self.episode_step[e] = 0

                    # reward and death info
                    if infos[e]['is_dying'] or infos[e]['is_dead']:
                        death[e] = True
                        if self.include_death:
                            rewards[e] = -1
                    # curriculum
                    self.curriculum.update(succ=False, acer_steps=acer_steps)
                    self.desired_goal[e], self.desired_goal_state[e], self.desired_goal_info[e] = self.curriculum.get_current_target(nb_goal=1)
            # state information for stateful models like LSTM
            self.obs = obs
            if self.dict_obs:
                self.achieved_goal = achieved_goal
            mb_rewards.append(rewards)
            mb_death.append(death)
            mb_dones.append(dones)

        mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_next_obs = np.asarray(mb_next_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_achieved_goal = np.asarray(mb_achieved_goal, dtype=np.float32).swapaxes(1, 0)
        mb_next_achieved_goal = np.asarray(mb_next_achieved_goal, dtype=np.float32).swapaxes(1, 0)
        mb_desired_goal = np.asarray(mb_desired_goal, dtype=np.float32).swapaxes(1, 0)
        mb_desired_goal_state = np.asarray(mb_desired_goal_state, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)

        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_death = np.asarray(mb_death, dtype=np.bool).swapaxes(1, 0)

        mb_next_obs_infos = np.asarray(mb_next_obs_infos, dtype=object).swapaxes(1, 0)
        mb_desired_goal_infos = np.asarray(mb_desired_goal_infos, dtype=object).swapaxes(1, 0)
        if not np.array_equal(mb_rewards, self.reward_fn(mb_next_obs_infos, mb_desired_goal_infos)):
            import ipdb
            ipdb.set_trace()
        results = dict(
            obs=mb_obs,
            next_obs=mb_next_obs,
            achieved_goal=mb_achieved_goal,
            next_achieved_goal=mb_next_achieved_goal,
            desired_goal=mb_desired_goal,
            desired_goal_state=mb_desired_goal_state,
            actions=mb_actions,
            rewards=mb_rewards,
            mus=mb_mus,
            dones=mb_dones,
            deaths=mb_death,
            next_obs_infos=mb_next_obs_infos,
            desired_goal_infos=mb_desired_goal_infos,
        )
        return results
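
A small shape check for the swapaxes(1, 0) calls above: the per-step lists are stacked time-major as (nsteps, nenv, ...), and the swap converts them to the env-major layout stored in results.

import numpy as np

nsteps, nenv = 20, 4
mb_rewards = [np.zeros(nenv, dtype=np.float32) for _ in range(nsteps)]
stacked = np.asarray(mb_rewards)       # (nsteps, nenv)
env_major = stacked.swapaxes(1, 0)     # (nenv, nsteps)
print(stacked.shape, env_major.shape)  # (20, 4) (4, 20)
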
Example #12
class Runner:
    TEMPLATE = 'env_{} {}!|goal:{}|final_pos:{}|length:{}'

    def __init__(self, env, model, curriculum, nsteps, reward_fn, threshold):
        assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)

        self.env = env
        self.model = model
        self.policy_inputs = self.model.policy_inputs
        self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        self.nact = env.action_space.n
        self.nbatch = nenv * nsteps
        self.obs_shape = self.model.obs_shape
        self.obs_dtype = self.model.obs_dtype
        self.ac_dtype = env.action_space.dtype
        self.achieved_goal_shape = self.model.achieved_goal_sh
        self.desired_goal_shape = self.model.desired_goal_sh
        self.desired_goal_state_shape = self.model.desired_goal_state_sh

        self.dict_obs = isinstance(self.env.observation_space, spaces.Dict)

        self.obs = np.zeros((nenv,) + self.obs_shape, dtype=self.obs_dtype)
        self.achieved_goal = np.zeros((nenv, ) + self.achieved_goal_shape, dtype=np.float32)
        self.desired_goal = np.zeros((nenv, ) + self.desired_goal_shape, dtype=np.float32)
        self.desired_goal_state = np.zeros((nenv, ) + self.desired_goal_state_shape, dtype=self.obs_dtype)
        self.desired_goal_info = np.zeros((nenv, ), dtype=object)

        self.nb_tile = self.achieved_goal.shape[-1] // 2
        if self.dict_obs:
            dict_obs = self.env.reset()
            self.obs[:] = dict_obs['observation']
            achieved_goal = dict_obs["achieved_goal"]
            self.achieved_goal[:] = np.tile(achieved_goal, [1, self.nb_tile])
        else:
            self.obs[:] = self.env.reset()

        self.nsteps = nsteps

        self.curriculum = curriculum
        self.desired_goal[:], self.desired_goal_state[:], self.desired_goal_info[:] = self.curriculum.get_current_target(nb_goal=self.nenv)

        self.recoder = None
        self.wrong_recorder = None
        self.episode_step = np.zeros(self.nenv, dtype=np.int32)
        self.reward_fn = reward_fn
        self.threshold = threshold
        self.include_death = False

        self.log_episode_step = deque(maxlen=10)
        self.log_episode_success = deque(maxlen=10)
        self.log_episode_x_pos = deque(maxlen=10)
        self.log_episode_y_pos = deque(maxlen=10)

    def run(self, acer_steps):
        if self.recoder is None:
            self.recoder = DataRecorder(os.path.join(logger.get_dir(), "runner_data"))
        if self.wrong_recorder is None:
            self.wrong_recorder = DataRecorder(os.path.join(logger.get_dir(), "wrong_data"))
        mb_obs, mb_next_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_death = [], [], [], [], [], [], []
        mb_next_obs_infos, mb_desired_goal_infos = [], []
        mb_achieved_goal, mb_next_achieved_goal, mb_desired_goal, mb_desired_goal_state = [], [], [], []

        for step in range(self.nsteps):
            actions, mus = self.model.step({
                'obs': self.obs.copy(),
                'achieved_goal': self.achieved_goal.copy(),
                'desired_goal': self.desired_goal.copy(),
                'desired_goal_state': self.desired_goal_state.copy()
            })
            mb_obs.append(np.copy(self.obs))
            mb_achieved_goal.append(np.copy(self.achieved_goal))
            mb_desired_goal.append(np.copy(self.desired_goal))
            mb_desired_goal_state.append(np.copy(self.desired_goal_state))
            mb_actions.append(actions)
            mb_mus.append(mus)
            mb_desired_goal_infos.append(np.copy(self.desired_goal_info))

            # step
            if self.dict_obs:
                dict_obs, _, dones, infos = self.env.step(actions)
                obs, achieved_goal = dict_obs['observation'], dict_obs['achieved_goal']
                achieved_goal = np.tile(achieved_goal, [1, self.nb_tile])   # expand from 2-d to 256-d
            else:
                obs, _, dones, infos = self.env.step(actions)
            rewards = np.zeros(self.nenv, np.float32)
            death = np.array([False for _ in range(self.nenv)], dtype=np.bool)
            self.episode_step += 1
            for e in range(self.nenv):
                if infos[e]['x_pos'] == 65535:
                    infos[e]['x_pos'] = 0

            # get real next obs and achieved goal
            next_obs = obs.copy()
            next_achieved_goal = goal_info_to_embedding(infos, goal_dim=self.achieved_goal.shape[-1])
            for e in range(self.nenv):
                if dones[e]:
                    if self.dict_obs:
                        _dict_obs = infos[e]['next_obs']
                        _obs = _dict_obs['observation']
                    else:
                        _obs = infos[e].get('next_obs')
                    assert _obs is not None
                    next_obs[e] = _obs
            mb_next_obs.append(next_obs)
            mb_next_obs_infos.append(infos)
            mb_next_achieved_goal.append(next_achieved_goal)

            # detecting wrong x_pos:
            for e in range(self.nenv):
                x_pos = infos[e].get('x_pos')
                if x_pos > 3000:
                    logger.info('detected a wrong x_pos:{}'.format(x_pos))
                    data = {'obs': self.obs[e], 'next_obs': obs[e], 'action': actions[e], 'info': infos[e],
                            'episode_step': self.episode_step[e], 'true_next_obs': next_obs[e], 'acer_step': acer_steps}
                    self.wrong_recorder.store(data)
                    self.wrong_recorder.dump()

            # achieved & episode done
            for e in range(self.nenv):
                reached = self.check_goal_reached_v2(infos[e], self.desired_goal_info[e])
                if reached or self.episode_step[e] > self.curriculum.allow_step or infos[e]["x_pos"] > self.desired_goal_info[e]["x_pos"] + 100:
                    # log info
                    final_pos = {"x_pos": infos[e]["x_pos"], "y_pos": infos[e]["y_pos"]}
                    if reached:
                        succ = True
                    else:
                        succ = False
                    self.recoder.store(dict(env=e, succ=succ, length=self.episode_step[e], final_pos=final_pos))
                    logger.info(self.TEMPLATE.format(e, succ, self.desired_goal_info[e], final_pos, self.episode_step[e]))

                    # episode info
                    self.log_episode_step.append(self.episode_step[e])
                    self.log_episode_success.append(1.0 if succ else 0.0)
                    self.log_episode_x_pos.append(infos[e]['x_pos'])
                    self.log_episode_y_pos.append(infos[e]['y_pos'])
                    self.episode_step[e] = 0

                    # reward and dones
                    if reached:
                        rewards[e] = 1.0
                    dones[e] = True

                    # reset
                    if self.dict_obs:
                        _dict_obs = self.env.reset_v2(e)
                        obs[e], achieved_goal[e] = _dict_obs['observation'][0], np.tile(_dict_obs['achieved_goal'][0], self.nb_tile)
                        assert np.array_equal(achieved_goal[e], np.tile(np.array([40., 176.]), self.nb_tile))
                    else:
                        _obs = self.env.reset_v2(e)[0]
                        obs[e] = _obs
                    # curriculum
                    self.curriculum.update(succ=succ, acer_steps=acer_steps)
                    self.desired_goal[e], self.desired_goal_state[e], self.desired_goal_info[e] = self.curriculum.get_current_target(nb_goal=1)
                elif dones[e]:
                    # log info
                    final_pos = {"x_pos": infos[e]["x_pos"], "y_pos": infos[e]["y_pos"]}
                    self.recoder.store(dict(env=e, succ=False, length=self.episode_step[e], final_pos=final_pos))
                    logger.info(self.TEMPLATE.format(e, False, self.desired_goal_info[e], final_pos, self.episode_step[e]))

                    # episode info
                    self.log_episode_step.append(self.episode_step[e])
                    self.log_episode_success.append(0.0)
                    self.log_episode_x_pos.append(infos[e]['x_pos'])
                    self.log_episode_y_pos.append(infos[e]['y_pos'])
                    self.episode_step[e] = 0

                    # reward and death info
                    if infos[e]['is_dying'] or infos[e]['is_dead']:
                        death[e] = True
                        if self.include_death:
                            rewards[e] = -1
                    # curriculum
                    self.curriculum.update(succ=False, acer_steps=acer_steps)
                    self.desired_goal[e], self.desired_goal_state[e], self.desired_goal_info[e] = self.curriculum.get_current_target(nb_goal=1)
            # state information for stateful models like LSTM
            self.obs = obs
            if self.dict_obs:
                self.achieved_goal = achieved_goal
            mb_rewards.append(rewards)
            mb_death.append(death)
            mb_dones.append(dones)

        mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_next_obs = np.asarray(mb_next_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_achieved_goal = np.asarray(mb_achieved_goal, dtype=np.float32).swapaxes(1, 0)
        mb_next_achieved_goal = np.asarray(mb_next_achieved_goal, dtype=np.float32).swapaxes(1, 0)
        mb_desired_goal = np.asarray(mb_desired_goal, dtype=np.float32).swapaxes(1, 0)
        mb_desired_goal_state = np.asarray(mb_desired_goal_state, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)

        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_death = np.asarray(mb_death, dtype=np.bool).swapaxes(1, 0)

        mb_next_obs_infos = np.asarray(mb_next_obs_infos, dtype=object).swapaxes(1, 0)
        mb_desired_goal_infos = np.asarray(mb_desired_goal_infos, dtype=object).swapaxes(1, 0)
        if not np.array_equal(mb_rewards, self.reward_fn(mb_next_obs_infos, mb_desired_goal_infos)):
            import ipdb
            ipdb.set_trace()
        results = dict(
            obs=mb_obs,
            next_obs=mb_next_obs,
            achieved_goal=mb_achieved_goal,
            next_achieved_goal=mb_next_achieved_goal,
            desired_goal=mb_desired_goal,
            desired_goal_state=mb_desired_goal_state,
            actions=mb_actions,
            rewards=mb_rewards,
            mus=mb_mus,
            dones=mb_dones,
            deaths=mb_death,
            next_obs_infos=mb_next_obs_infos,
            desired_goal_infos=mb_desired_goal_infos,
        )
        return results

    def check_goal_reached_v2(self, obs_info, goal_info):
        obs_x, obs_y = float(obs_info["x_pos"]), float(obs_info["y_pos"])
        goal_x, goal_y = float(goal_info["x_pos"]), float(goal_info["y_pos"])
        diff_x = abs(obs_x - goal_x)
        diff_y = abs(obs_y - goal_y)
        if diff_x <= self.threshold[0] and diff_y <= self.threshold[1]:
            status = True
        else:
            status = False
        return status

    def evaluate(self):
        tstart = time.time()
        n_episode = 0
        self.log_episode_step = deque(maxlen=10)
        self.log_episode_success = deque(maxlen=10)
        self.log_episode_x_pos = deque(maxlen=10)
        self.log_episode_y_pos = deque(maxlen=10)
        self.episode_step[:] = 0

        if self.dict_obs:
            dict_obs = self.env.reset()
            self.obs[:] = dict_obs['observation']
            achieved_goal = dict_obs["achieved_goal"]
            self.achieved_goal[:] = np.tile(achieved_goal, [1, self.nb_tile])
        else:
            self.obs[:] = self.env.reset()

        while n_episode < 10:
            while True:
                actions, mus = self.model.step({
                    'obs': self.obs.copy(),
                    'achieved_goal': self.achieved_goal.copy(),
                    'desired_goal': self.desired_goal.copy(),
                    'desired_goal_state': self.desired_goal_state.copy()
                })
                # step
                if self.dict_obs:
                    dict_obs, _, dones, infos = self.env.step(actions)
                    obs, achieved_goal = dict_obs['observation'], dict_obs['achieved_goal']
                    achieved_goal = np.tile(achieved_goal, [1, self.nb_tile])   # expand from 2-d to 256-d
                else:
                    obs, _, dones, infos = self.env.step(actions)
                self.episode_step += 1
                for e in range(self.nenv):
                    if infos[e]['x_pos'] == 65535:
                        infos[e]['x_pos'] = 0

                # achieved & episode done
                for e in range(self.nenv):
                    reached = self.check_goal_reached_v2(infos[e], self.desired_goal_info[e])
                    if reached or self.episode_step[e] > self.curriculum.allow_step or infos[e]["x_pos"] > self.desired_goal_info[e]["x_pos"] + 100:
                        self.log_episode_step.append(self.episode_step[e])
                        self.log_episode_success.append(1.0 if reached else 0.0)
                        self.log_episode_x_pos.append(infos[e]['x_pos'])
                        self.log_episode_y_pos.append(infos[e]['y_pos'])
                        self.episode_step[e] = 0
                        dones[e] = True
                        n_episode += 1
                        # reset
                        if self.dict_obs:
                            _dict_obs = self.env.reset_v2(e)
                            obs[e], achieved_goal[e] = _dict_obs['observation'][0], np.tile(_dict_obs['achieved_goal'][0], self.nb_tile)
                            assert np.array_equal(achieved_goal[e], np.tile(np.array([40., 176.]), self.nb_tile))
                        else:
                            _obs = self.env.reset_v2(e)[0]
                            obs[e] = _obs
                    elif dones[e]:
                        # episode info
                        self.log_episode_step.append(self.episode_step[e])
                        self.log_episode_success.append(0.0)
                        self.log_episode_x_pos.append(infos[e]['x_pos'])
                        self.log_episode_y_pos.append(infos[e]['y_pos'])
                        self.episode_step[e] = 0
                        n_episode += 1
                # state information for stateful models like LSTM
                self.obs = obs
                if self.dict_obs:
                    self.achieved_goal = achieved_goal
                if n_episode >= 10:
                    break
        logs = list()
        logs.append(('test/final_x_pos', np.mean(self.log_episode_x_pos)))
        logs.append(('test/final_y_pos', np.mean(self.log_episode_y_pos)))
        logs.append(('test/success', np.mean(self.log_episode_success)))
        logs.append(('test/episode_length', np.mean(self.log_episode_step)))
        logs.append(('time/evaluate', time.time() - tstart))
        return logs

    def logs(self):
        logs = list()
        logs.append(('train/final_x_pos', self._safe_mean(self.log_episode_x_pos)))
        logs.append(('train/final_y_pos', self._safe_mean(self.log_episode_y_pos)))
        logs.append(('train/success', self._safe_mean(self.log_episode_success)))
        logs.append(('train/episode_length', self._safe_mean(self.log_episode_step)))
        return logs

    @staticmethod
    def _safe_mean(x):
        if len(x) == 0:
            return 0.
        else:
            return np.mean(x)
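
run() asserts that mb_rewards matches self.reward_fn(mb_next_obs_infos, mb_desired_goal_infos). The project's actual reward_fn is not shown; a hypothetical version consistent with check_goal_reached_v2 would apply the same threshold test elementwise over the (nenv, nsteps) arrays of info dicts:

import numpy as np


def make_reward_fn(threshold):
    """Hypothetical reward_fn: 1.0 where the next-obs position lies within `threshold`
    of the desired goal, 0.0 otherwise."""
    def reward_fn(next_obs_infos, desired_goal_infos):
        rewards = np.zeros(next_obs_infos.shape, dtype=np.float32)
        for idx in np.ndindex(next_obs_infos.shape):
            obs_info, goal_info = next_obs_infos[idx], desired_goal_infos[idx]
            diff_x = abs(float(obs_info["x_pos"]) - float(goal_info["x_pos"]))
            diff_y = abs(float(obs_info["y_pos"]) - float(goal_info["y_pos"]))
            if diff_x <= threshold[0] and diff_y <= threshold[1]:
                rewards[idx] = 1.0
        return rewards
    return reward_fn
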
Example #13
class Runner(AbstractEnvRunner):
    def __init__(self,
                 env,
                 model,
                 nsteps,
                 store_data,
                 reward_fn,
                 sample_goal,
                 threshold=None,
                 alt_model=None,
                 use_random_policy_expl=None):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(
            env.action_space, spaces.Discrete
        ), 'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv *
                               (nsteps + 1), ) + env.observation_space.shape

        # self.obs = env.reset()  the super().__init__ call already does this
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype
        self.ac_shape = env.action_space.shape
        self.nstack = self.env.nstack
        self.nc = self.batch_ob_shape[-1] // self.nstack
        self.goal_shape = self.model.goal_shape
        self.goal_as_image = self.model.goal_as_image

        self.save_path = os.path.join(logger.get_dir(), "runner_data")
        self.store_data = store_data
        self.recorder = DataRecorder(self.save_path)

        self.dynamics = self.model.dynamics
        self.sample_goal = sample_goal
        self.threshold = threshold
        # self.batch_goal_feat_shape = (nenv*(nsteps+1),) + env.observation_space.shape + self.dynamics.feat_shape
        self.reached_status = np.array([False for _ in range(self.nenv)],
                                       dtype=bool)
        self.goals, self.goal_info = None, None
        self.reward_fn = reward_fn
        # self.results_writer = ResultsWriter(os.path.join(save_path, "evaluation.csv"))

        self.episode = np.ones(self.nenv)
        self.episode_step = np.zeros(self.nenv)
        self.episode_reached_step = np.zeros(self.nenv)
        self.episode_reward_to_go = np.zeros(self.nenv)

        self.name = self.model.scope.split("acer_")[1]

        self.alt_model = alt_model
        self.use_random_policy_expl = use_random_policy_expl
        if self.use_random_policy_expl:
            assert alt_model is not None

    def run(self, acer_step=None):
        if self.goals is None:
            self.goals, self.goal_info = self.dynamics.get_goal(
                nb_goal=self.nenv)
            if not self.goal_as_image:
                self.goals = goal_to_embedding(self.goal_info)
        mb_obs = np.empty((self.nenv, self.nsteps) + self.obs_shape,
                          dtype=self.obs_dtype)
        mb_next_obs = np.empty((self.nenv, self.nsteps) + self.obs_shape,
                               dtype=self.obs_dtype)
        mb_act = np.empty((self.nenv, self.nsteps) + self.ac_shape,
                          dtype=self.ac_dtype)
        mb_mus = np.empty((self.nenv, self.nsteps, self.nact),
                          dtype=np.float32)
        mb_ext_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
        mb_int_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
        mb_masks = np.empty((self.nenv, self.nsteps + 1), dtype=bool)
        mb_ext_rew = np.empty((self.nenv, self.nsteps), dtype=np.float32)
        mb_next_obs_infos = np.empty((self.nenv, self.nsteps), dtype=object)
        mb_goals = np.empty((self.nenv, self.nsteps) + self.goal_shape,
                            dtype=self.obs_dtype)
        mb_goal_infos = np.empty((self.nenv, self.nsteps), dtype=object)

        reached_step, done_step = np.array([
            None for _ in range(self.nenv)
        ]), np.array([None for _ in range(self.nenv)])

        episode_infos = np.asarray([{} for _ in range(self.nenv)],
                                   dtype=object)
        for step in range(self.nsteps):
            check_obs(self.obs)

            actions, mus, states = self.model.step(self.obs,
                                                   S=self.states,
                                                   M=self.dones,
                                                   goals=self.goals)
            if self.sample_goal:
                if self.use_random_policy_expl:
                    actions[self.reached_status] = self.simple_random_action(
                        np.sum(self.reached_status))
                    mus[self.reached_status] = self.get_mu_of_random_action()
                else:
                    if np.sum(self.reached_status) > 0:
                        alt_action, alt_mu, alt_states = self.alt_model.step(
                            self.obs,
                            S=self.states,
                            M=self.dones,
                            goals=self.goals)
                        actions[self.reached_status] = alt_action[
                            self.reached_status]
                        mus[self.reached_status] = alt_mu[self.reached_status]

            mb_obs[:, step] = deepcopy(self.obs)
            mb_act[:, step] = actions
            mb_mus[:, step, :] = mus
            mb_masks[:, step] = deepcopy(self.dones)

            obs, rewards, dones, infos = self.env.step(actions)
            check_infos(infos, self.recorder, dones, acer_step)
            for info in infos:
                info.update({"source": self.name})

            mb_ext_dones[:, step] = dones
            mb_ext_rew[:, step] = rewards
            self.episode_reward_to_go[self.reached_status] += rewards[
                self.reached_status]
            mb_next_obs[:, step] = self.get_real_next_obs(obs, dones, infos)
            mb_next_obs_infos[:, step] = np.asarray(infos, dtype=object)
            mb_goals[:, step] = deepcopy(self.goals)
            mb_goal_infos[:, step] = deepcopy(self.goal_info)
            self.episode_step += 1
            # state information for stateful models like LSTM
            self.states = states
            self.dones = dones
            self.obs = obs

            # check reached
            reached_step = self.update_reach(reached_step, infos, step)
            # check done
            done_step[self.dones] = step

            # revise goal
            if not self.sample_goal:
                mb_goals, mb_goal_infos = self.update_goal_v1(
                    mb_next_obs, mb_goals, infos, mb_goal_infos, done_step,
                    step)
            else:
                mb_goals, mb_goal_infos = self.update_goal_v2(
                    mb_next_obs, mb_goals, infos, mb_goal_infos, reached_step,
                    step)
            # summary
            episode_infos = self.summary(episode_infos, infos, acer_step)

        mb_int_rewards = self.reward_fn(mb_next_obs_infos, mb_goal_infos)
        mb_int_dones.fill(False)
        int_dones_index = np.where(mb_int_rewards)
        mb_int_dones[int_dones_index] = True
        # shapes are adjusted to [nenv, nsteps, []]

        self.recorder.dump()
        results = dict(
            obs=mb_obs,
            next_obs=mb_next_obs,
            actions=mb_act,
            ext_rewards=mb_ext_rew,
            mus=mb_mus,
            ext_dones=mb_ext_dones,
            int_dones=mb_int_dones,
            masks=mb_masks,
            next_obs_infos=mb_next_obs_infos,  # (nenv, nsteps); two purposes: 1) feed the dynamics model; 2) put into the buffer
            episode_infos=episode_infos,
            goal_obs=mb_goals,  # (nenv, nsteps) + goal_shape
            goal_infos=mb_goal_infos,
            int_rewards=mb_int_rewards)
        return results

    def simple_random_action(self, nb_action):
        return np.random.randint(0, self.env.action_space.n, nb_action)

    def get_mu_of_random_action(self):
        assert isinstance(self.env.action_space, spaces.Discrete)
        return np.array([
            1 / self.env.action_space.n for _ in range(self.env.action_space.n)
        ])

    def initialize(self, init_steps):
        mb_obs, mb_actions, mb_next_obs, mb_goal_infos = [], [], [], []
        for _ in range(init_steps):
            mb_obs.append(deepcopy(self.obs))
            actions = np.asarray(
                [self.env.action_space.sample() for _ in range(self.nenv)])
            self.obs, rewards, dones, infos = self.env.step(actions)
            goal_infos = np.array([{
                "x_pos": info.get("x_pos", None),
                "y_pos": info.get("y_pos", None),
                "source": self.name
            } for info in infos],
                                  dtype=object)
            next_obs = self.get_real_next_obs(np.copy(self.obs), dones, infos)
            mb_next_obs.append(next_obs)
            mb_goal_infos.append(goal_infos)
            mb_actions.append(actions)

        mb_obs = np.asarray(mb_obs).swapaxes(1, 0)  # (nenv, nstep, obs_shape)
        mb_goal_infos = np.asarray(mb_goal_infos, dtype=object).swapaxes(
            1, 0)  # (nenv, nstep, dict)
        mb_actions = np.asarray(mb_actions).swapaxes(1, 0)
        mb_next_obs = np.asarray(mb_next_obs).swapaxes(1, 0)

        mb_obs = mb_obs.reshape((-1, ) + mb_obs.shape[2:])
        mb_goal_infos = mb_goal_infos.reshape(-1, )
        mb_actions = mb_actions.reshape((-1, ) + mb_actions.shape[2:])
        mb_next_obs = mb_next_obs.reshape((-1, ) + mb_next_obs.shape[2:])

        for i in range(10):
            batch_size = min(64, init_steps)
            ind = np.random.randint(0, init_steps, batch_size)
            obs, actions, next_obs = mb_obs[ind], mb_actions[ind], mb_next_obs[
                ind]
            nb_train_epoch = 1
            self.model.train_dynamics(obs, actions, next_obs, nb_train_epoch)
        self.dynamics.put_goal(mb_obs, mb_actions, mb_next_obs, mb_goal_infos)
        self.obs = self.env.reset()

    def evaluate(self, nb_eval):
        assert self.dynamics.dummy
        goal_obs, goal_info = self.dynamics.get_goal(
            nb_goal=self.nenv)  # (nenv, goal_dim)
        eval_info = {"l": 0, "r": 0, "x_pos": 0, "y_pos": 0}
        for i in range(nb_eval):
            terminal = False
            while True:
                actions, mus, states = self.model.step(self.obs,
                                                       S=self.states,
                                                       M=self.dones,
                                                       goals=goal_obs)
                obs, rewards, dones, infos = self.env.step(actions)
                info = infos[0]
                if info.get("episode"):
                    assert dones[0]
                    eval_info["l"] += info.get("episode")["l"]
                    eval_info["r"] += info.get("episode")["r"]
                    eval_info["x_pos"] += info.get("x_pos")
                    eval_info["y_pos"] += info.get("y_pos")
                    terminal = True
                if terminal:
                    break
                self.states = states
                self.dones = dones
                self.obs = obs
            self.obs = self.env.reset()
        for key in eval_info.keys():
            eval_info[key] /= nb_eval
        return eval_info

    def log(self, mem):
        succ = "succ" if mem["is_succ"] else "fail"
        template = "env_{} {}|goal:{}|final_pos:{}|size:{}".format(
            mem["env"], succ, {
                "x_pos": mem["goal"]["x_pos"],
                "y_pos": mem["goal"]["y_pos"]
            }, mem["final_pos"], self.dynamics.queue.qsize())
        logger.info(template)

    def summary(self, episode_infos, infos, acer_step):
        for env_idx in range(self.nenv):
            info = infos[env_idx]
            if self.dones[env_idx]:
                assert info.get("episode")
                if info.get("episode"):
                    episode_infos[env_idx]["episode"] = info.get("episode")
                if not self.sample_goal:
                    episode_infos[env_idx]["reached_info"] = dict(
                        source=self.name,
                        x_pos=infos[env_idx]["x_pos"],
                        y_pos=infos[env_idx]["y_pos"])
                else:
                    if self.reached_status[env_idx]:
                        reached = 1.0
                        time_ratio = self.episode_reached_step[
                            env_idx] / self.episode_step[env_idx]
                        achieved_pos = {
                            "x_pos": infos[env_idx]["x_pos"],
                            "y_pos": infos[env_idx]["y_pos"]
                        }
                        mem = dict(env=env_idx,
                                   is_succ=True,
                                   goal=self.goal_info[env_idx],
                                   final_pos=achieved_pos,
                                   timestep=acer_step,
                                   episode=self.episode[env_idx],
                                   step=self.episode_step[env_idx])
                        self.recorder.store(mem)
                        self.log(mem)
                        abs_dist = 10
                    else:
                        reached = 0.0
                        time_ratio = 1.0
                        achieved_pos = {
                            "x_pos": infos[env_idx]["x_pos"],
                            "y_pos": infos[env_idx]["y_pos"]
                        }
                        mem = dict(env=env_idx,
                                   is_succ=False,
                                   goal=self.goal_info[env_idx],
                                   final_pos=achieved_pos,
                                   timestep=acer_step,
                                   episode=self.episode[env_idx],
                                   step=self.episode_step[env_idx])
                        self.recorder.store(mem)
                        self.log(mem)
                        abs_dist = abs(float(infos[env_idx]["x_pos"]) - float(self.goal_info[env_idx]["x_pos"])) + \
                                   abs(float(infos[env_idx]["y_pos"]) - float(self.goal_info[env_idx]["y_pos"]))
                    episode_infos[env_idx]["reached_info"] = dict(
                        reached=reached,
                        time_ratio=time_ratio,
                        abs_dist=abs_dist,
                        source=self.name,
                        x_pos=infos[env_idx]["x_pos"],
                        y_pos=infos[env_idx]["y_pos"])
                    episode_infos[env_idx]["goal_info"] = dict(
                        x_pos=self.goal_info[env_idx]["x_pos"],
                        y_pos=self.goal_info[env_idx]["y_pos"],
                        source=self.goal_info[env_idx]["source"],
                        reward_to_go=self.episode_reward_to_go[env_idx])
                    # re-plan goal
                    goal_obs, goal_info = self.dynamics.get_goal(nb_goal=1)
                    if self.goal_as_image:
                        self.goals[env_idx] = goal_obs[0]
                    else:
                        self.goals[env_idx] = goal_to_embedding(goal_info[0])
                    self.goal_info[env_idx] = goal_info[0]
                    self.episode[env_idx] += 1
                    self.episode_step[env_idx] = 0
                    self.episode_reached_step[env_idx] = 0
                    self.reached_status[env_idx] = False
                    self.episode_reward_to_go[env_idx] = 0
        return episode_infos

    def get_real_next_obs(self, next_obs, dones, infos):
        _next_obs = next_obs.copy()
        for env_idx in range(self.nenv):
            if dones[env_idx]:
                o = infos[env_idx].get("next_obs", None)
                assert o is not None
                _next_obs[env_idx] = o
        return _next_obs

    def update_reach(self, reached_step, infos, step):
        if self.sample_goal:
            for env_idx in range(self.nenv):
                if not self.reached_status[env_idx]:
                    self.reached_status[env_idx] = check_goal_reached(
                        infos[env_idx], self.goal_info[env_idx],
                        self.threshold)
                    if self.reached_status[env_idx]:
                        reached_step[env_idx] = step
                        self.episode_reached_step[env_idx] = deepcopy(
                            self.episode_step[env_idx])
        return reached_step
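
    # Sketch (not part of the original example): update_reach above calls a module-level
    # check_goal_reached(info, goal_info, threshold) helper that is imported from elsewhere
    # and not shown here. Assuming it mirrors the L1-distance test that Example #16 performs
    # inline (check_goal_reached_v2), a minimal version could look like this hypothetical
    # staticmethod:
    @staticmethod
    def check_goal_reached_sketch(obs_info, goal_info, threshold):
        # Manhattan distance between the current position and the goal position.
        dist = abs(float(obs_info["x_pos"]) - float(goal_info["x_pos"])) + \
               abs(float(obs_info["y_pos"]) - float(goal_info["y_pos"]))
        return dist < threshold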

    def update_goal_v1(self, mb_next_obs, mb_goals, infos, mb_goal_infos,
                       done_step, step):
        assert not self.sample_goal
        for env_idx in range(self.nenv):
            if self.dones[env_idx]:
                # (- - done(t)) -> (done done, done(t))
                start, end = 0, step + 1
                if self.goal_as_image:
                    mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                else:
                    mb_goals[env_idx,
                             start:end] = goal_to_embedding(infos[env_idx])
                mb_goal_infos[env_idx, start:end] = infos[env_idx]
            elif step == self.nsteps - 1:
                if done_step[env_idx] is None:
                    # (- - t) -> (t, t, t)
                    start = 0
                else:
                    # (- - done - - t) -> (- - - t, t, t)
                    start = done_step[env_idx] + 1
                end = step + 1
                if end == start:
                    continue
                if self.goal_as_image:
                    mb_goals[env_idx, start:end] = mb_next_obs[env_idx, step]
                else:
                    mb_goals[env_idx,
                             start:end] = goal_to_embedding(infos[env_idx])
                mb_goal_infos[env_idx, start:end] = infos[env_idx]
        return mb_goals, mb_goal_infos

    def update_goal_v2(self, mb_next_obs, mb_goals, infos, mb_goal_infos,
                       reached_step, step):
        assert self.sample_goal
        for env_idx in range(self.nenv):
            if step != self.nsteps - 1:
                # dones is an instantaneous flag, while reached_status persists across steps of the episode
                if self.dones[env_idx] and self.reached_status[env_idx]:
                    if reached_step[env_idx] is None:
                        # reach|[- - done] -> [done, done, done]
                        start, end = 0, step + 1
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx,
                                                                       step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(
                                infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    else:
                        # [- - reach(done)] -> [ - - -]  if reached_step[env_idx] == step
                        # [- - reach - - done] -> [- - - done done done]
                        start, end = reached_step[env_idx] + 1, step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx,
                                                                       step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(
                                infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                elif not self.dones[env_idx] and self.reached_status[env_idx]:
                    # reached|[ - - -]  if reached_step[env_idx] is None:
                    # [- - reached - -] if reached_step[env_idx] is not None
                    pass
                else:
                    # [- - - done] if self.dones[env_idx] and not self.reached_status[env_idx]
                    # [- - - - -] if not self.dones[env_idx] and not self.reached_status[env_idx]
                    pass
            else:
                if self.dones[env_idx] and self.reached_status[env_idx]:
                    if reached_step[env_idx] is None:
                        # reach|[- - done(t)] -> [done, done, done(t)]
                        start, end = 0, step + 1
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx,
                                                                       step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(
                                infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    else:
                        # [- - reach(done)(t)] -> [- - -]
                        # [- - reach - - done(t)] -> [- - - done done done(t)]
                        start, end = reached_step[env_idx] + 1, step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_next_obs[env_idx,
                                                                       step]
                        else:
                            mb_goals[env_idx, start:end] = goal_to_embedding(
                                infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                elif not self.dones[env_idx] and self.reached_status[env_idx]:
                    if reached_step[env_idx] is None:
                        # reached|[ - - t]  -> reached|[t t t]
                        start, end = 0, step + 1
                    else:
                        # reached[- - r - -] -> reached|[- - - t t]
                        start, end = reached_step[env_idx] + 1, step + 1
                    if end == start:
                        continue
                    if self.goal_as_image:
                        mb_goals[env_idx, start:end] = mb_next_obs[env_idx,
                                                                   step]
                    else:
                        mb_goals[env_idx,
                                 start:end] = goal_to_embedding(infos[env_idx])
                else:
                    # [- - - done(t)]  if self.dones[env_idx] and not self.reached_status[env_idx]
                    # [- - - - (t)] if not self.dones[env_idx] and not self.reached_status[env_idx]
                    pass
        return mb_goals, mb_goal_infos
Example #15
class Dynamics:
    def __init__(self, sess, env, auxiliary_task, queue_size, feat_dim,
                 normalize_novelty):
        self.sess = sess
        self.dummy = False
        self.make_auxiliary_task = {
            "RF": RandomFeature,
            "IDF": InverseDynamics,
            "RND": RandomNetworkDistillation
        }[auxiliary_task.upper()]
        self.auxiliary_task = self.make_auxiliary_task(env, feat_dim)
        self.obs = self.auxiliary_task.obs
        self.next_obs = self.auxiliary_task.next_obs
        self.ac = self.auxiliary_task.ac
        self.ac_space = self.auxiliary_task.ac_space

        self.feat = tf.stop_gradient(self.auxiliary_task.feature)
        self.feat_shape = tuple(self.feat.get_shape().as_list()[1:])
        self.feat_var = tf.reduce_mean(tf.nn.moments(self.feat, axes=-1)[1])
        self.out_feat = tf.stop_gradient(self.auxiliary_task.next_feature)
        self.nenv = env.num_envs if hasattr(env, 'num_envs') else 1

        with tf.variable_scope("dynamics"):
            self.novelty_tf = tf.placeholder(tf.float32, [None],
                                             "novelty_placeholder")
            if isinstance(self.auxiliary_task, RandomNetworkDistillation):
                self.dyna_loss = tf.zeros([])
                self.novelty = self.auxiliary_task.get_novelty()
            elif isinstance(self.auxiliary_task,
                            InverseDynamics) or isinstance(
                                self.auxiliary_task, RandomFeature):
                with tf.variable_scope("loss"):
                    self.novelty = self._get_novelty()
                    self.dyna_loss = tf.reduce_mean(self.novelty)
            else:
                raise NotImplementedError

        self.dyna_params = tf.trainable_variables("dynamics")
        self.aux_params = tf.trainable_variables(self.auxiliary_task.scope)
        self.params = self.dyna_params + self.aux_params

        self.aux_loss = self.auxiliary_task.loss
        self.loss = self.aux_loss + self.dyna_loss

        self.queue = PriorityQueue(queue_size)
        self.novelty_rms = RunningMeanStd(epsilon=1e-4)
        self.novelty_normalized = tf.clip_by_value(
            (self.novelty_tf - self.novelty_rms.mean) / self.novelty_rms.std,
            -5., 5.)

        self.normalized = normalize_novelty
        if normalize_novelty:
            logger.info("normalize novelty")
        path = logger.get_dir()
        path = os.path.join(path, "goal_data")
        self.goal_recoder = DataRecorder(path)
        self.goal_store_baseline = 1500

        self.density_estimate = deque(maxlen=int(1e4))

        self.eval_interval = 20
        self.eval_data_status = {}
        self.eval_data = []
        path = logger.get_dir()
        self.eval_path = path = os.path.join(path, "novelty_evaluation")
        self.eval_recoder = DataRecorder(path)

        path = logger.get_dir()
        path = os.path.join(path, "error_goal")
        self.error_recoder = DataRecorder(path)

    def _get_novelty(self):
        if isinstance(self.ac_space, spaces.Box):
            assert len(self.ac_space.shape) == 1
            ac = self.ac  # continuous actions are concatenated as-is
        elif isinstance(self.ac_space, spaces.Discrete):
            ac = tf.one_hot(self.ac, self.ac_space.n)
        elif isinstance(self.ac_space, spaces.MultiDiscrete):
            raise NotImplementedError
        elif isinstance(self.ac_space, spaces.MultiBinary):
            ac = tf.one_hot(self.ac, self.ac_space.n)
        else:
            raise NotImplementedError

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        hidsize = 512
        activ = tf.nn.leaky_relu
        x = fc(add_ac(self.feat), nh=hidsize, scope="fc_1")
        if activ is not None:
            x = activ(x)

        def residual(x, scope):
            res = fc(add_ac(x), nh=hidsize, scope=scope + "_1")
            res = tf.nn.leaky_relu(res)
            res = fc(add_ac(res), nh=hidsize, scope=scope + "_2")
            return x + res

        for _ in range(4):
            x = residual(x, scope="residual_{}".format(_ + 1))
        n_out_features = self.out_feat.get_shape()[-1].value
        x = fc(add_ac(x), nh=n_out_features, scope="output")
        return tf.reduce_mean(tf.square(x - self.out_feat), axis=-1)
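
    # Sketch (not part of the original example): _get_novelty predicts the next feature from
    # (feature, action) with a small residual MLP; novelty is the per-sample mean squared
    # prediction error. Restated in NumPy for reference (pred_next_feat and next_feat are
    # assumed to be (batch, feat_dim) arrays; np is already imported in this module):
    @staticmethod
    def novelty_from_prediction_error(pred_next_feat, next_feat):
        return np.mean(np.square(pred_next_feat - next_feat), axis=-1)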

    def put_goal(self, obs, actions, next_obs, goal_infos):
        assert list(obs.shape)[1:] == self.obs.get_shape().as_list(
        )[1:], "obs shape:{}.please flatten obs".format(obs.shape)
        assert list(actions.shape)[1:] == self.ac.get_shape().as_list(
        )[1:], "action shape:{}.please flatten actions".format(actions.shape)
        assert list(next_obs.shape)[1:] == self.next_obs.get_shape().as_list(
        )[1:], "next obs shape:{}.please flatten obs".format(next_obs.shape)
        assert len(goal_infos.shape) == 1, "info shape:{}".format(
            goal_infos.shape)

        # sample goal according to x_pos
        x_pos = [info["x_pos"] for info in goal_infos]
        for index, x in enumerate(x_pos):
            seg = x // self.eval_interval * self.eval_interval
            if seg not in self.eval_data_status:
                self.eval_data_status[seg] = False
            self.density_estimate.append(x)
            if not self.eval_data_status[seg]:
                self.eval_data.append({
                    "obs": obs[index],
                    "actions": actions[index],
                    "next_obs": next_obs[index],
                    "info": goal_infos[index]
                })
                self.eval_data_status[seg] = True
                self.eval_data = sorted(self.eval_data,
                                        key=lambda y: y["info"]["x_pos"])
        if np.max(x_pos) > self.goal_store_baseline:
            self.goal_recoder.store(self.eval_data)
            self.goal_recoder.dump()
            self.goal_store_baseline += 1000
            logger.info("store {} goal.now baseline:{}".format(
                len(self.eval_data), self.goal_store_baseline))
        # store goal into queue according to priority.
        novelty = self.sess.run(self.novelty,
                                feed_dict={
                                    self.obs: obs,
                                    self.next_obs: next_obs,
                                    self.ac: actions
                                })
        if self.normalized:
            self.novelty_rms.update(novelty)
            priority = -self.sess.run(self.novelty_normalized,
                                      feed_dict={self.novelty_tf: novelty})
        else:
            priority = -novelty
        stats = self._add_goal(obs, actions, next_obs, goal_infos, priority)
        return stats

    def get_goal(self, nb_goal, replace=True, alpha=1.0, beta=0.95):
        assert self.queue.qsize() >= nb_goal
        goal_priority, goal_feat, goal_obs, goal_act, goal_next_obs, goal_info = [], [], [], [], [], []
        while len(goal_obs) != nb_goal:
            data = self.queue.get()
            if (data[5]["x_pos"] <= 55) and (data[5]["y_pos"] <= 180):
                self.error_recoder.store(data)
                self.error_recoder.dump()
                logger.info("detecting an error goal:{} and remove it".format(
                    data[5]))
                continue
            goal_priority.append(data[0])
            goal_obs.append(data[2])
            goal_act.append(data[3])
            goal_next_obs.append(data[4])
            goal_info.append(data[5])
        goal_priority = np.asarray(goal_priority)
        # IMPORTANT: the goal observation handed out is the next_obs field of the queue entry.
        goals = np.asarray(goal_next_obs)
        if replace:
            goal_act = np.asarray(goal_act)
            goal_next_obs = np.asarray(goal_next_obs)
            novelty = self.sess.run(self.novelty,
                                    feed_dict={
                                        self.obs: goal_obs,
                                        self.ac: goal_act,
                                        self.next_obs: goal_next_obs
                                    })
            if self.normalized:
                self.novelty_rms.update(novelty)
                priority = -self.sess.run(self.novelty_normalized,
                                          feed_dict={self.novelty_tf: novelty})
            else:
                priority = -novelty

            priority = (1 - alpha) * goal_priority * beta + alpha * priority
            self._add_goal(goal_obs, goal_act, goal_next_obs, goal_info,
                           priority)
        assert list(goals.shape)[1:] == self.obs.get_shape().as_list(
        )[1:], "goal_obs:{}".format(goals.shape)
        return goals, goal_info
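
    # Sketch (not part of the original example): when goals are re-inserted by get_goal above,
    # their priority is mixed as (1 - alpha) * old_priority * beta + alpha * new_priority.
    # With the default alpha=1.0 the stale priority is dropped entirely; a smaller alpha would
    # blend a decayed copy of the old priority with the freshly computed (negated) novelty.
    # The helper name below is hypothetical, used only to restate the rule:
    @staticmethod
    def mixed_priority(old_priority, new_priority, alpha=1.0, beta=0.95):
        return (1 - alpha) * old_priority * beta + alpha * new_priority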

    def _add_goal(self, obs, actions, next_obs, infos, priority):
        baseline = None
        stats = dict()
        for i in range(len(priority)):
            if self.queue.qsize() < self.nenv * 5:
                data = (priority[i], time.time(), obs[i], actions[i],
                        next_obs[i], infos[i])
                self.queue.put(data)
            else:
                if baseline is None:
                    queue_p = [-item[0] for item in self.queue.queue]
                    stats["queue_max"], stats["queue_std"] = np.max(
                        queue_p), np.std(queue_p)
                    baseline = -0.75 * stats["queue_max"]
                if priority[i] < baseline:
                    data = (priority[i], time.time(), obs[i], actions[i],
                            next_obs[i], infos[i])
                    if self.queue.full():
                        maxvalue_idx = np.argmax(
                            [item[0] for item in self.queue.queue])
                        self.queue.queue.pop(maxvalue_idx)
                    self.queue.put(data)
        return stats
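
    # Sketch (not part of the original example): layout of a single queue entry as written by
    # _add_goal and unpacked by get_goal. PriorityQueue is a min-heap, so the entries with the
    # lowest priority value (i.e. the most novel transitions, since priority = -novelty) are
    # popped first; the insertion timestamp breaks ties between equal priorities. The helper
    # name is hypothetical and only documents the tuple layout:
    @staticmethod
    def make_queue_entry(priority, obs, action, next_obs, info):
        return (priority,     # data[0]: negated (optionally normalized) novelty
                time.time(),  # data[1]: insertion time, used as a tiebreaker
                obs,          # data[2]
                action,       # data[3]
                next_obs,     # data[4]: returned by get_goal as the goal observation
                info)         # data[5]: dict with "x_pos", "y_pos", "source"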

    def evaluate(self, steps, plot=False):
        if len(self.eval_data) > 0:
            obs, act, next_obs, x_pos = [], [], [], []
            for i in range(len(self.eval_data)):
                obs.append(self.eval_data[i]["obs"])
                act.append(self.eval_data[i]["actions"])
                next_obs.append(self.eval_data[i]["next_obs"])
                x_pos.append(self.eval_data[i]["info"]["x_pos"])
            obs = np.asarray(obs, dtype=np.float32)
            act = np.asarray(act, dtype=np.float32)
            next_obs = np.asarray(next_obs, dtype=np.float32)
            x_pos = np.asarray(x_pos, dtype=np.float32)
            novelty = self.sess.run(self.novelty,
                                    feed_dict={
                                        self.obs: obs,
                                        self.ac: act,
                                        self.next_obs: next_obs
                                    })
            p = pearsonr(x_pos, novelty)[0]
            if plot:
                plt.figure(dpi=80)
                plt.subplot(2, 1, 1)
                plt.scatter(x_pos, novelty)
                plt.title("pos & novelty")
                plt.yscale("log")
                plt.subplot(2, 1, 2)
                density = np.array(self.density_estimate)
                sns.kdeplot(density)
                plt.title("sample density")
                plt.savefig(
                    os.path.join(self.eval_path, "{}.png".format(steps)))
                plt.close()

            self.eval_recoder.store({
                "x_pos": x_pos,
                "novelty": novelty,
                "p": p,
                "steps": steps
            })
            self.eval_recoder.dump()

            return ["pos_novelty_p"], [p]
        else:
            return ["pos_novelty_p"], [np.nan]
Example #16
class Runner(AbstractEnvRunner):
    def __init__(self, env, model, nsteps, store_data, reward_fn, sample_goal, dist_type, alt_model=None,
                 use_random_policy_expl=None,):
        super().__init__(env=env, model=model, nsteps=nsteps)
        assert isinstance(env.action_space,
                          spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape

        # self.obs = env.reset()  # the super().__init__ call already does this
        self.obs_dtype = env.observation_space.dtype
        self.obs_shape = env.observation_space.shape
        self.ac_dtype = env.action_space.dtype
        self.ac_shape = env.action_space.shape
        self.nstack = self.env.nstack
        self.nc = self.batch_ob_shape[-1] // self.nstack
        self.goal_shape = self.model.goal_shape
        self.goal_as_image = self.model.goal_as_image

        self.save_path = os.path.join(logger.get_dir(), "runner_data")
        self.store_data = store_data
        self.recorder = DataRecorder(self.save_path)

        self.dynamics = self.model.dynamics
        self.sample_goal = sample_goal
        # self.batch_goal_feat_shape = (nenv*(nsteps+1),) + env.observation_space.shape + self.dynamics.feat_shape
        self.reached_status = np.array([False for _ in range(self.nenv)], dtype=bool)
        self.goals, self.goal_info = None, None
        self.reward_fn = reward_fn
        # self.results_writer = ResultsWriter(os.path.join(save_path, "evaluation.csv"))

        self.episode = np.ones(self.nenv)
        self.episode_step = np.zeros(self.nenv)
        self.episode_reached_step = np.zeros(self.nenv)
        self.episode_reward_to_go = np.zeros(self.nenv)

        self.name = self.model.scope.split("acer_")[1]

        assert dist_type in ["l1", "l2"]
        self.dist_type = dist_type
        self.alt_model = alt_model
        self.use_random_policy_expl = use_random_policy_expl
        if self.use_random_policy_expl:
            assert alt_model is not None

    def run(self, acer_step=None):
        if self.goals is None:
            self.goals, self.goal_info = self.dynamics.get_goal(nb_goal=self.nenv)
            if not self.goal_as_image:
                self.goals = self.goal_to_embedding(self.goal_info)
        # enc_obs = np.split(self.obs, self.nstack, axis=3)  # so now list of obs steps
        enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1)
        mb_obs = np.empty((self.nenv, self.nsteps + 1) + self.obs_shape, dtype=self.obs_dtype)
        mb_act = np.empty((self.nenv, self.nsteps) + self.ac_shape, dtype=self.ac_dtype)
        mb_mus = np.empty((self.nenv, self.nsteps, self.nact), dtype=np.float32)
        mb_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
        mb_masks = np.empty((self.nenv, self.nsteps + 1), dtype=bool)
        mb_ext_rew = np.empty((self.nenv, self.nsteps), dtype=np.float32)
        mb_obs_infos = np.empty((self.nenv, self.nsteps), dtype=object)
        mb_goals = np.empty((self.nenv, self.nsteps + 1) + self.goal_shape, dtype=self.obs_dtype)
        mb_goal_infos = np.empty((self.nenv, self.nsteps), dtype=object)

        # mb_obs, mb_actions, mb_mus, mb_dones, mb_ext_rewards = [], [], [], [], []
        # mb_obs_infos, mb_goals, mb_goal_infos = [], [], []
        reached_step, done_step = np.array([None for _ in range(self.nenv)]), np.array([None for _ in range(self.nenv)])

        episode_infos = np.asarray([{} for _ in range(self.nenv)], dtype=object)
        for step in range(self.nsteps):
            try:
                check_obs(self.obs)
            except ValueError:
                logger.warn("acer_step:{}, runner_step:{}, empty obs".format(acer_step, step))
                raise ValueError
            actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=self.goals)
            if self.sample_goal:
                if self.use_random_policy_expl:
                    actions[self.reached_status] = self.simple_random_action(np.sum(self.reached_status))
                    mus[self.reached_status] = self.get_mu_of_random_action()
                else:
                    if np.sum(self.reached_status) > 0:
                        alt_action, alt_mu, alt_states = self.alt_model.step(self.obs, S=self.states, M=self.dones, goals=self.goals)
                        actions[self.reached_status] = alt_action[self.reached_status]
                        mus[self.reached_status] = alt_mu[self.reached_status]

            mb_obs[:, step] = deepcopy(self.obs)
            mb_act[:, step] = actions
            mb_mus[:, step, :] = mus
            mb_masks[:, step] = deepcopy(self.dones)

            obs, rewards, dones, infos = self.env.step(actions)
            try:
                check_infos(infos)
            except ValueError:
                logger.warn("warning!wrong infos!program continues anyway")
                logger.info("infos:{}, dones:{}, acer_step:{}".format(infos, dones, acer_step))
                logger.info("please debug it in runner_data/data.pkl")
                self.recorder.store(infos)
                self.recorder.dump()
            for info in infos:
                info.update({"source": self.name})

            enc_obs.append(obs[..., -self.nc:])
            mb_dones[:, step] = dones
            mb_ext_rew[:, step] = rewards
            self.episode_reward_to_go[self.reached_status] += rewards[self.reached_status]
            mb_obs_infos[:, step] = np.asarray(infos, dtype=object)
            mb_goals[:, step] = deepcopy(self.goals)
            mb_goal_infos[:, step] = deepcopy(self.goal_info)
            self.episode_step += 1
            # states information for statefull models like LSTM
            self.states = states
            self.dones = dones
            self.obs = obs

            # check reached
            if self.sample_goal:
                for env_idx in range(self.nenv):
                    if not self.reached_status[env_idx]:
                        if self.dist_type == "l1":
                            self.reached_status[env_idx] = self.check_goal_reached_v2(infos[env_idx],
                                                                                      self.goal_info[env_idx])
                        else:
                            raise NotImplementedError("I do not know how to compute goal_latent")
                        if self.reached_status[env_idx]:
                            reached_step[env_idx] = step
                            self.episode_reached_step[env_idx] = deepcopy(self.episode_step[env_idx])

            # check done
            done_step[self.dones] = step

            # revise goal
            if not self.sample_goal:
                for env_idx in range(self.nenv):
                    if self.dones[env_idx]:
                        # (- - done(t)) -> (done done, done(t))
                        start, end = 0, step + 1
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_obs[env_idx, step] 
                        else:
                            mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    elif step == self.nsteps - 1:
                        if done_step[env_idx] is None:
                            # (- - t) -> (t, t, t)
                            start = 0
                        else:
                            # (- - done - - t) -> (- - - t, t, t)
                            start = done_step[env_idx] + 1
                        end = step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
            else:
                for env_idx in range(self.nenv):
                    if step != self.nsteps - 1:
                        # dones is an instantaneous flag, while reached_status persists across steps of the episode
                        if self.dones[env_idx] and self.reached_status[env_idx]:
                            if reached_step[env_idx] is None:
                                # reach|[- - done] -> [done, done, done]
                                start, end = 0, step + 1
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                            else:
                                # [- - reach(done)] -> [ - - -]  if reached_step[env_idx] == step
                                # [- - reach - - done] -> [- - - done done done]
                                start, end = reached_step[env_idx] + 1, step + 1
                                if end == start:
                                    continue
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                        elif not self.dones[env_idx] and self.reached_status[env_idx]:
                            # reached|[ - - -]  if reached_step[env_idx] is None:
                            # [- - reached - -] if reached_step[env_idx] is not None
                            pass
                        else:
                            # [- - - done] if self.dones[env_idx] and not self.reached_status[env_idx]
                            # [- - - - -] if not self.dones[env_idx] and not self.reached_status[env_idx]
                            pass
                    else:
                        if self.dones[env_idx] and self.reached_status[env_idx]:
                            if reached_step[env_idx] is None:
                                # reach|[- - done(t)] -> [done, done, done(t)]
                                start, end = 0, step + 1
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                            else:
                                # [- - reach(done)(t)] -> [- - -]
                                # [- - reach - - done(t)] -> [- - - done done done(t)]
                                start, end = reached_step[env_idx] + 1, step + 1
                                if end == start:
                                    continue
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                        elif not self.dones[env_idx] and self.reached_status[env_idx]:
                            if reached_step[env_idx] is None:
                                # reached|[ - - t]  -> reached|[t t t]
                                start, end = 0, step + 1
                            else:
                                # reached[- - r - -] -> reached|[- - - t t]
                                start, end = reached_step[env_idx] + 1, step + 1
                            if end == start:
                                continue
                            if self.goal_as_image:
                                mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                            else:
                                mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                        else:
                            # [- - - done(t)]  if self.dones[env_idx] and not self.reached_status[env_idx]
                            # [- - - - (t)] if not self.dones[env_idx] and not self.reached_status[env_idx]
                            pass
            # summary
            for env_idx in range(self.nenv):
                info = infos[env_idx]
                if self.dones[env_idx]:
                    assert info.get("episode")
                    if info.get("episode"):
                        episode_infos[env_idx]["episode"] = info.get("episode")
                    if not self.sample_goal:
                        episode_infos[env_idx]["reached_info"] = dict(source=self.name,
                                                                      x_pos=infos[env_idx]["x_pos"],
                                                                      y_pos=infos[env_idx]["y_pos"])
                    else:
                        if self.reached_status[env_idx]:
                            reached = 1.0
                            time_ratio = self.episode_reached_step[env_idx] / self.episode_step[env_idx]
                            achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                            mem = dict(env=env_idx, is_succ=True, goal=self.goal_info[env_idx], final_pos=achieved_pos,
                                       timestep=acer_step, episode=self.episode[env_idx], step=self.episode_step[env_idx])
                            self.recorder.store(mem)
                            self.log(mem)
                            abs_dist = 10
                        else:
                            reached = 0.0
                            time_ratio = 1.0
                            achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                            mem = dict(env=env_idx, is_succ=False, goal=self.goal_info[env_idx], final_pos=achieved_pos,
                                       timestep=acer_step, episode=self.episode[env_idx], step=self.episode_step[env_idx])
                            self.recorder.store(mem)
                            self.log(mem)
                            abs_dist = abs(float(infos[env_idx]["x_pos"]) - float(self.goal_info[env_idx]["x_pos"])) + \
                                       abs(float(infos[env_idx]["y_pos"]) - float(self.goal_info[env_idx]["y_pos"]))
                        episode_infos[env_idx]["reached_info"] = dict(reached=reached, time_ratio=time_ratio,
                                                                      abs_dist=abs_dist, source=self.name,
                                                                      x_pos=infos[env_idx]["x_pos"],
                                                                      y_pos=infos[env_idx]["y_pos"])
                        episode_infos[env_idx]["goal_info"] = dict(x_pos=self.goal_info[env_idx]["x_pos"],
                                                                   y_pos=self.goal_info[env_idx]["y_pos"],
                                                                   source=self.goal_info[env_idx]["source"],
                                                                   reward_to_go=self.episode_reward_to_go[env_idx])
                        # re-plan goal
                        goal_obs, goal_info = self.dynamics.get_goal(nb_goal=1)
                        if self.goal_as_image:
                            self.goals[env_idx] = goal_obs[0]
                        else:
                            self.goals[env_idx] = self.goal_to_embedding(goal_info[0])
                        self.goal_info[env_idx] = goal_info[0]
                        self.episode[env_idx] += 1
                        self.episode_step[env_idx] = 0
                        self.episode_reached_step[env_idx] = 0
                        self.reached_status[env_idx] = False
                        self.episode_reward_to_go[env_idx] = 0

        # next obs and next goal
        mb_obs[:, -1] = deepcopy(self.obs)
        mb_goals[:, -1] = mb_goals[:, -2]  # we cannot use self.goals since it may have been revised

        if self.dist_type == "l2":
            raise NotImplementedError
        else:
            mb_int_rewards = self.reward_fn(mb_obs_infos, mb_goal_infos)
        # shapes are adjusted to [nenv, nsteps, []]
        enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0)

        self.recorder.dump()
        results = dict(
            enc_obs=enc_obs,
            obs=mb_obs,
            actions=mb_act,
            ext_rewards=mb_ext_rew,
            mus=mb_mus,
            dones=mb_dones,
            masks=mb_masks,
            obs_infos=mb_obs_infos,  # (nenv, nsteps); serves two purposes: 1) fed into the dynamics model; 2) stored in the buffer
            episode_infos=episode_infos,
            goal_obs=mb_goals,  # nenv, nsteps+1,
            goal_infos=mb_goal_infos,
            int_rewards=mb_int_rewards
        )
        return results

    def check_goal_reached(self, obs_feat, desired_goal):
        assert obs_feat.shape == desired_goal.shape
        assert len(obs_feat.shape) == 1
        if self.dynamics.dummy:
            return False
        else:
            eps = 1e-6
            tol = 0.03
            status = (np.square(obs_feat - desired_goal).sum() / (np.square(desired_goal).sum() + eps)) < tol
            return status

    @staticmethod
    def check_goal_reached_v2(obs_info, goal_info):
        eps = 20
        obs_x, obs_y = float(obs_info["x_pos"]), float(obs_info["y_pos"])
        goal_x, goal_y = float(goal_info["x_pos"]), float(goal_info["y_pos"])
        dist = abs(obs_x - goal_x) + abs(obs_y - goal_y)
        if dist < eps:
            status = True
        else:
            status = False
        return status
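
    # Usage sketch (not part of the original example), with made-up positions:
    #   Runner.check_goal_reached_v2({"x_pos": 120, "y_pos": 79},
    #                                {"x_pos": 130, "y_pos": 85})
    #   -> |120 - 130| + |79 - 85| = 16 < 20, so the goal counts as reached.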

    def simple_random_action(self, nb_action):
        return np.random.randint(0, self.env.action_space.n, nb_action)

    def get_mu_of_random_action(self):
        assert isinstance(self.env.action_space, spaces.Discrete)
        return np.array([1 / self.env.action_space.n for _ in range(self.env.action_space.n)])

    @staticmethod
    def goal_to_embedding(goal_infos):
        feat_dim = 512
        nb_tile = feat_dim // 2
        if isinstance(goal_infos, dict):
            goal_embedding = np.array([goal_infos["x_pos"], goal_infos["y_pos"]], dtype=np.float32).reshape(1, 2)
            goal_embedding = np.tile(goal_embedding, [1]*len(goal_embedding.shape[:-1])+[nb_tile])
            return goal_embedding
        
        def get_pos(x):
            return float(x["x_pos"]), float(x["y_pos"])
        vf = np.vectorize(get_pos)
        goal_pos = vf(goal_infos)
        goal_x, goal_y = np.expand_dims(goal_pos[0], -1).astype(np.float32), np.expand_dims(goal_pos[1], -1).astype(np.float32)
        goal_embedding = np.concatenate([goal_x, goal_y], axis=-1)
        goal_embedding = np.tile(goal_embedding, [1]*len(goal_embedding.shape[:-1])+[nb_tile])
        return goal_embedding
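
    # Shape sketch (not part of the original example): with feat_dim = 512 the embedding just
    # tiles (x_pos, y_pos) 256 times along the last axis, so for hypothetical info dicts:
    #   Runner.goal_to_embedding({"x_pos": 100.0, "y_pos": 80.0}).shape            -> (1, 512)
    #   Runner.goal_to_embedding(np.array([info_a, info_b], dtype=object)).shape   -> (2, 512)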

    def initialize(self, init_steps):
        mb_obs, mb_actions, mb_next_obs, mb_goal_infos = [], [], [], []
        for _ in range(init_steps):
            mb_obs.append(deepcopy(self.obs))
            actions = np.asarray([self.env.action_space.sample() for _ in range(self.nenv)])
            self.obs, rewards, dones, infos = self.env.step(actions)
            goal_infos = np.array([{"x_pos": info.get("x_pos", None),
                                    "y_pos": info.get("y_pos", None),
                                    "source": self.name} for info in infos], dtype=object)
            mb_goal_infos.append(goal_infos)
            mb_actions.append(actions)
            mb_next_obs.append(deepcopy(self.obs))
        mb_obs = np.asarray(mb_obs).swapaxes(1, 0)  # (nenv, nstep, obs_shape)
        mb_goal_infos = np.asarray(mb_goal_infos, dtype=object).swapaxes(1, 0)  # (nenv, nstep, dict)
        mb_actions = np.asarray(mb_actions).swapaxes(1, 0)
        mb_next_obs = np.asarray(mb_next_obs).swapaxes(1, 0)

        batch_size = min(128, init_steps)
        ind = np.random.randint(0, init_steps, batch_size)
        mb_obs = mb_obs.reshape((-1,) + mb_obs.shape[2:])[ind]
        mb_goal_infos = mb_goal_infos.reshape(-1, )[ind]
        mb_actions = mb_actions.reshape((-1,) + mb_actions.shape[2:])[ind]
        mb_next_obs = mb_next_obs.reshape((-1,) + mb_next_obs.shape[2:])[ind]

        for i in range(10):
            self.model.train_dynamics(mb_obs, mb_actions, mb_next_obs, 0)
        self.dynamics.put_goal(mb_obs, mb_actions, mb_next_obs, mb_goal_infos)
        self.obs = self.env.reset()

    def evaluate(self, nb_eval):
        assert self.dynamics.dummy
        goal_obs, goal_info = self.dynamics.get_goal(nb_goal=self.nenv)  # (nenv, goal_dim)
        eval_info = {"l": 0, "r": 0}
        for i in range(nb_eval):
            terminal = False
            while True:
                actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=goal_obs)
                obs, rewards, dones, infos = self.env.step(actions)
                info = infos[0]
                if info.get("episode"):
                    assert dones[0]
                    eval_info["l"] += info.get("episode")["l"]
                    eval_info["r"] += info.get("episode")["r"]
                    terminal = True
                if terminal:
                    break
                self.states = states
                self.dones = dones
                self.obs = obs
        self.obs = self.env.reset()
        eval_info["l"] /= nb_eval
        eval_info["r"] /= nb_eval
        return eval_info

    def log(self, mem):
        succ = "succ" if mem["is_succ"] else "fail"
        template = "env_{} {}|goal:{}|final_pos:{}|size:{}".format(
            mem["env"], succ, {"x_pos": mem["goal"]["x_pos"], "y_pos": mem["goal"]["y_pos"]},
            mem["final_pos"], self.dynamics.queue.qsize()
        )
        logger.info(template)