Example #1
    def __init__(self, **kwargs):
        self.env = kwargs.get('env')
        super().__init__(self.env)
        self.actor_lr = kwargs.get('actor_lr', 1e-4)
        self.critic_lr = kwargs.get('critic_lr', 2e-4)
        self.entropy = kwargs.get('entropy', 1e-4)
        self.actor_units = kwargs.get('actor_units', [128])
        self.critic_units = kwargs.get('critic_units', [128])
        self.horizon = kwargs.get('horizon', 64)
        self.update_rate = kwargs.get('update_rate', 2048)
        self.batch_size = kwargs.get('batch_size', 64)
        self.epoch = kwargs.get('epoch', 4)
        self.clip = kwargs.get('clip', 0.2)
        self.gamma = kwargs.get('gamma', 0.995)
        self.lambd = kwargs.get('lambd', 0.97)
        self.resize = kwargs.get('resize', 84)
        self.seqlen = kwargs.get('seqlen', 1)
        self.state_shape = []

        self.gamlam = self.gamma * self.lambd
        self.memory = HorizonMemory()
        self.replay = BatchMemory()

        self.actor = tf.keras.models.Model()
        self.critic = tf.keras.models.Model()
        self.preprocess_obs = (lambda a, b=None, c=None, d=None, e=None: a)
        self.entropy_func = (lambda a, b: a)
        self.log_pi_func = (lambda a, b: a)
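The constructor above relies on two small buffer classes that are not shown. As a reference point only, here is a minimal sketch, assuming the interface implied by the calls in Example #5 below (append one step, return the stored columns in the order memory_process() unpacks them, flush, and report a length); BatchMemory would expose the same append/rollout/flush/__len__ surface for the replay buffer. This is an assumption, not the project's implementation.

class HorizonMemory:
    def __init__(self):
        self.states, self.actions, self.rewards, self.log_pis = [], [], [], []

    def append(self, state, action, reward, log_pi):
        # one environment step, in the order append_horizon() passes them
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_pis.append(log_pi)

    def rollout(self):
        # order matches: states, actions, log_pis, rewards = memory.rollout()
        return self.states, self.actions, self.log_pis, self.rewards

    def flush(self):
        self.states, self.actions, self.rewards, self.log_pis = [], [], [], []

    def __len__(self):
        return len(self.states)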
Example #2
    def __init__(self, vail_sample, reward_shift, reward_aug, gae_norm,
                 global_norm, actor_lr, critic_lr, disc_lr, actor_units,
                 critic_units, disc_units, disc_reduce_units, gamma, lambd,
                 clip, entropy, epochs, batch_size, update_rate, data_dir,
                 demo_list):
        # build network
        self.actor = Actor(lr=actor_lr, hidden_units=actor_units)
        self.critic = Critic(lr=critic_lr, hidden_units=critic_units)
        self.discriminator = Discriminator(lr=disc_lr,
                                           hidden_units=disc_units,
                                           reduce_units=disc_reduce_units)
        self.encoder = VAE_Encoder(latent_num=64)

        # set hyperparameters
        self.vail_sample = vail_sample
        self.reward_shift = reward_shift
        self.reward_aug = reward_aug
        self.gae_norm = gae_norm
        self.gamma = gamma
        self.lambd = lambd
        self.gam_lam = gamma * lambd
        self.clip = clip
        self.entropy = entropy
        self.epochs = epochs
        self.batch_size = batch_size
        self.half_batch_size = batch_size // 2
        self.update_rate = update_rate
        self.grad_global_norm = global_norm
        self.beta = BETA_INIT

        # build memory
        self.memory = HorizonMemory(use_reward=reward_aug)
        self.replay = ReplayMemory()

        # build expert demonstration Pipeline
        self.data_dir = data_dir
        self.demo_list = os.listdir(data_dir)
        self.demo_group_num = 500
        self.demo_rotate = 5
        assert len(demo_list) >= self.demo_group_num
        self.set_demo()

        # ready
        self.dummy_forward()
        self.actor_vars = self.actor.trainable_variables + self.encoder.trainable_variables
        self.critic_vars = self.critic.trainable_variables + self.encoder.trainable_variables
        self.disc_vars = self.discriminator.trainable_variables + self.encoder.trainable_variables
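Assuming this constructor belongs to the GAIL class shown in Example #4, a hedged instantiation sketch might look as follows. The PPO-style values mirror the defaults in Examples #1 and #5; the discriminator settings, clipping norm, and demo path are placeholders, and the directory is assumed to contain at least demo_group_num (500) .npz demonstrations.

import os

demo_dir = 'demos/'  # hypothetical path
agent = GAIL(
    vail_sample=True, reward_shift=False, reward_aug=False, gae_norm=True,
    global_norm=0.5,                     # assumed gradient-clipping norm
    actor_lr=1e-4, critic_lr=2e-4, disc_lr=2e-4,
    actor_units=[128], critic_units=[128],
    disc_units=[128], disc_reduce_units=[64],
    gamma=0.995, lambd=0.97, clip=0.2, entropy=1e-4,
    epochs=4, batch_size=64, update_rate=2048,
    data_dir=demo_dir, demo_list=os.listdir(demo_dir))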
Example #3
    def __init__(self, reward_shift, actor_units, critic_units, disc_units, disc_reduce_units, code_units):
        # build network
        self.actor  = Actor(lr=0, hidden_units=actor_units)
        self.critic = Critic(lr=0, hidden_units=critic_units)
        self.discriminator = Discriminator(
            lr=0, hidden_units=disc_units, reduce_units=disc_reduce_units)
        self.encoder = VAE_Encoder(latent_num=64)
        self.prior = DiscretePosterior(lr=0, hidden_units=code_units)

        # set hyperparameters
        self.reward_shift = reward_shift
        self.memory = HorizonMemory()

        # ready
        self.dummy_forward()
Example #4
class GAIL:
    def __init__(self, vail_sample, reward_shift, reward_aug, gae_norm,
                 global_norm, actor_lr, critic_lr, disc_lr, actor_units,
                 critic_units, disc_units, disc_reduce_units, gamma, lambd,
                 clip, entropy, epochs, batch_size, update_rate, data_dir,
                 demo_list):
        # build network
        self.actor = Actor(lr=actor_lr, hidden_units=actor_units)
        self.critic = Critic(lr=critic_lr, hidden_units=critic_units)
        self.discriminator = Discriminator(lr=disc_lr,
                                           hidden_units=disc_units,
                                           reduce_units=disc_reduce_units)
        self.encoder = VAE_Encoder(latent_num=64)

        # set hyperparameters
        self.vail_sample = vail_sample
        self.reward_shift = reward_shift
        self.reward_aug = reward_aug
        self.gae_norm = gae_norm
        self.gamma = gamma
        self.lambd = lambd
        self.gam_lam = gamma * lambd
        self.clip = clip
        self.entropy = entropy
        self.epochs = epochs
        self.batch_size = batch_size
        self.half_batch_size = batch_size // 2
        self.update_rate = update_rate
        self.grad_global_norm = global_norm
        self.beta = BETA_INIT

        # build memory
        self.memory = HorizonMemory(use_reward=reward_aug)
        self.replay = ReplayMemory()

        # build expert demonstration Pipeline
        self.data_dir = data_dir
        self.demo_list = os.listdir(data_dir)
        self.demo_group_num = 500
        self.demo_rotate = 5
        assert len(demo_list) >= self.demo_group_num
        self.set_demo()

        # ready
        self.dummy_forward()
        self.actor_vars = self.actor.trainable_variables + self.encoder.trainable_variables
        self.critic_vars = self.critic.trainable_variables + self.encoder.trainable_variables
        self.disc_vars = self.discriminator.trainable_variables + self.encoder.trainable_variables

    def dummy_forward(self):
        # connect networks
        dummy_state = np.zeros([1] + STATE_SHAPE, dtype=np.float32)
        dummy_action = np.zeros([1] + ACTION_SHAPE, dtype=np.float32)
        self.encoder(dummy_state)
        self.actor(self.encoder, dummy_state)
        self.critic(self.encoder, dummy_state)
        self.discriminator(self.encoder, dummy_state, dummy_action)

    def set_demo(self):
        self.demo_list = os.listdir(self.data_dir)
        selected_demos = random.sample(self.demo_list, self.demo_group_num)

        expert_states = []
        expert_actions = []
        for demo_name in selected_demos:
            demo = np.load(self.data_dir + demo_name)
            states = demo['state']
            actions = demo['action']

            expert_states.append(states)
            expert_actions.append(actions)
        self.expert_states = np.concatenate(expert_states, axis=0)
        self.expert_actions = np.concatenate(expert_actions, axis=0)
        del demo

    def get_demonstration(self, sample_num):
        # resample the expert pool if it cannot provide enough samples
        if len(self.expert_states) < sample_num:
            self.set_demo()
        index = np.arange(len(self.expert_states))
        np.random.shuffle(index)
        index = index[:sample_num]
        return self.expert_states[index], self.expert_actions[index]

    def memory_process(self, next_state, done):
        # [[(1,64,64,3)], [], ...], [[(1,2),(1,9),(1,3),(1,4)], [], ...], [[c_pi, d_pi, s_pi, a_pi], [], ...]
        if self.reward_aug:
            states, actions, log_old_pis, rewards = self.memory.rollout()
        else:
            states, actions, log_old_pis = self.memory.rollout()
        np_states = np.concatenate(states + [next_state], axis=0)
        np_actions = np.concatenate(actions, axis=0)

        np_rewards = self.get_reward(np_states[:-1], np_actions)  # (N, 1)
        if self.reward_aug:
            np_env_rewards = np.stack(rewards, axis=0).reshape(-1, 1)
            np_rewards = np_rewards + np_env_rewards
        gae, oracle = self.get_gae_oracle(np_states, np_rewards,
                                          done)  # (N, 1), (N, 1)
        self.replay.append(states, actions, log_old_pis, gae, oracle)
        self.memory.flush()
        if len(self.replay) >= self.update_rate:
            self.update()
            self.replay.flush()

    def get_action(self, state):
        policy = self.actor(self.encoder, state).numpy()[0]
        action = np.random.choice(ACTION_NUM, p=policy)
        # action = np.argmax(policy)
        action_one_hot = np.eye(ACTION_NUM,
                                dtype=np.float32)[[action]]  # (1, 4)
        log_old_pi = [[np.log(policy[action] + 1e-8)]]  # (1, 1)
        return action, action_one_hot, log_old_pi, policy

    def get_reward(self, states, actions):
        d = self.discriminator(self.encoder, states, actions).numpy()  # (N, 1)
        # rewards = 0.5 - d       # linear reward
        # rewards = np.tan(0.5 - d)     # tan reward
        if self.reward_shift:
            rewards = -np.log(2.0 * d + 1e-8)  # log equil reward
        else:
            rewards = -np.log(d + 1e-8)  # log reward
        # rewards = 0.1 * np.where(rewards>1, 1, rewards)
        return rewards

    def get_gae_oracle(self, states, rewards, done):
        # states include next state
        values = self.critic(self.encoder, states).numpy()  # (N+1, 1)
        if done:
            values[-1] = np.float32([0])
        N = len(rewards)
        gae = 0
        gaes = np.zeros((N, 1), dtype=np.float32)
        oracles = np.zeros((N, 1), dtype=np.float32)
        for t in reversed(range(N)):
            oracles[t] = rewards[t] + self.gamma * values[t + 1]
            delta = oracles[t] - values[t]
            gae = delta + self.gam_lam * gae
            gaes[t][0] = gae

        # oracles = gaes + values[:-1]        # (N, 1)
        if self.gae_norm:
            gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + 1e-8)
        return gaes, oracles

    def update(self):
        # load & calculate data
        states, actions, log_old_pis, gaes, oracles \
            = self.replay.rollout()

        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        log_old_pis = np.concatenate(log_old_pis, axis=0)
        gaes = np.concatenate(gaes, axis=0)
        oracles = np.concatenate(oracles, axis=0)
        N = len(states)
        # update discriminator
        # load expert demonstration
        s_e, a_e = self.get_demonstration(N)

        batch_num = N // self.half_batch_size
        index = np.arange(N)
        np.random.shuffle(index)
        for i in range(batch_num):
            idx = index[i * self.half_batch_size:(i + 1) *
                        self.half_batch_size]
            s_concat = np.concatenate([states[idx], s_e[idx]], axis=0)
            a_concat = np.concatenate([actions[idx], a_e[idx]], axis=0)

            with tf.GradientTape(persistent=True) as tape:
                mu, std, sampled = self.discriminator.encode(
                    self.encoder, s_concat, a_concat)

                discs = self.discriminator.decode(
                    sampled if self.vail_sample else mu)
                kld_loss = tf.reduce_mean(tf_gaussian_KL(mu, 0, std, 1))
                agent_loss = -tf.reduce_mean(
                    tf.math.log(discs[:self.half_batch_size] + 1e-8))
                expert_loss = -tf.reduce_mean(
                    tf.math.log(1 + 1e-8 - discs[self.half_batch_size:]))
                disc_loss = agent_loss + expert_loss
                discriminator_loss = disc_loss + self.beta * kld_loss
            disc_grads = tape.gradient(discriminator_loss, self.disc_vars)
            if self.grad_global_norm > 0:
                disc_grads, _ = tf.clip_by_global_norm(disc_grads,
                                                       self.grad_global_norm)
            self.discriminator.opt.apply_gradients(
                zip(disc_grads, self.disc_vars))
            del tape

        # TODO: update posterior
        # L1 loss = logQ(code|s,prev_a,prev_code)
        # update actor & critic
        # batch_num = math.ceil(len(states) / self.batch_size)
        batch_num = len(gaes) // self.batch_size
        index = np.arange(len(gaes))
        for _ in range(self.epochs):
            np.random.shuffle(index)
            for i in range(batch_num):
                # if i == batch_num - 1:
                #     idx = index[i*self.batch_size : ]
                # else:
                idx = index[i * self.batch_size:(i + 1) * self.batch_size]
                state = states[idx]
                action = actions[idx]
                log_old_pi = log_old_pis[idx]
                gae = gaes[idx]
                oracle = oracles[idx]

                # update critic
                with tf.GradientTape(persistent=True) as tape:
                    values = self.critic(self.encoder, state)  # (N, 1)
                    critic_loss = tf.reduce_mean(
                        (oracle - values)**2)  # MSE loss
                critic_grads = tape.gradient(critic_loss, self.critic_vars)
                if self.grad_global_norm > 0:
                    critic_grads, _ = tf.clip_by_global_norm(
                        critic_grads, self.grad_global_norm)
                self.critic.opt.apply_gradients(
                    zip(critic_grads, self.critic_vars))
                del tape

                # update actor
                with tf.GradientTape(persistent=True) as tape:
                    pred_action = self.actor(self.encoder, state)

                    # RL (PPO) term
                    log_pi = tf.expand_dims(tf.math.log(
                        tf.reduce_sum(pred_action * action, axis=1) + 1e-8),
                                            axis=1)  # (N, 1)
                    ratio = tf.exp(log_pi - log_old_pi)
                    clip_ratio = tf.clip_by_value(ratio, 1 - self.clip,
                                                  1 + self.clip)
                    clip_loss = -tf.reduce_mean(
                        tf.minimum(ratio * gae, clip_ratio * gae))
                    entropy = tf.reduce_mean(tf.exp(log_pi) * log_pi)
                    actor_loss = clip_loss + self.entropy * entropy

                actor_grads = tape.gradient(
                    actor_loss, self.actor_vars)  # NOTE: freeze posterior
                if self.grad_global_norm > 0:
                    actor_grads, _ = tf.clip_by_global_norm(
                        actor_grads, self.grad_global_norm)
                self.actor.opt.apply_gradients(
                    zip(actor_grads, self.actor_vars))

                del tape
            # print('%d samples trained... D loss: %.4f C loss: %.4f A loss: %.4f\t\t\t'
            #     % (len(gaes), disc_loss, critic_loss, actor_loss), end='\r')

    def save_model(self, dir, tag=''):
        self.actor.save_weights(dir + tag + 'actor.h5')
        self.critic.save_weights(dir + tag + 'critic.h5')
        self.discriminator.save_weights(dir + tag + 'discriminator.h5')
        self.encoder.save_weights(dir + tag + 'encoder.h5')

    def load_model(self, dir, tag=''):
        if os.path.exists(dir + tag + 'actor.h5'):
            self.actor.load_weights(dir + tag + 'actor.h5')
            print('Actor loaded... %s%sactor.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'critic.h5'):
            self.critic.load_weights(dir + tag + 'critic.h5')
            print('Critic loaded... %s%scritic.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'discriminator.h5'):
            self.discriminator.load_weights(dir + tag + 'discriminator.h5')
            print('Discriminator loaded... %s%sdiscriminator.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('encoder loaded... %s%sencoder.h5' % (dir, tag))

    def load_encoder(self, dir, tag=''):
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('encoder loaded... %s%sencoder.h5' % (dir, tag))
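To make the recursion in get_gae_oracle concrete, here is a small standalone NumPy check of the same loop. The rewards and values are made-up numbers; gamma and lambd reuse the defaults seen elsewhere in these examples.

import numpy as np

gamma, lambd = 0.995, 0.97
rewards = np.array([[1.0], [0.0], [1.0]], dtype=np.float32)        # (N, 1)
values = np.array([[0.5], [0.4], [0.6], [0.3]], dtype=np.float32)  # (N+1, 1); last entry is V(next state)

gae = 0.0
gaes = np.zeros_like(rewards)
oracles = np.zeros_like(rewards)
for t in reversed(range(len(rewards))):
    oracles[t] = rewards[t] + gamma * values[t + 1]  # one-step TD target
    delta = oracles[t] - values[t]                   # TD error
    gae = delta + gamma * lambd * gae                # GAE(gamma, lambda) accumulator
    gaes[t] = gae
print(gaes.ravel(), oracles.ravel())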
Example #5
class PPO(Agent):
    def __init__(self, **kwargs):
        self.env = kwargs.get('env')
        super().__init__(self.env)
        self.actor_lr = kwargs.get('actor_lr', 1e-4)
        self.critic_lr = kwargs.get('critic_lr', 2e-4)
        self.entropy = kwargs.get('entropy', 1e-4)
        self.actor_units = kwargs.get('actor_units', [128])
        self.critic_units = kwargs.get('critic_units', [128])
        self.horizon = kwargs.get('horizon', 64)
        self.update_rate = kwargs.get('update_rate', 2048)
        self.batch_size = kwargs.get('batch_size', 64)
        self.epoch = kwargs.get('epoch', 4)
        self.clip = kwargs.get('clip', 0.2)
        self.gamma = kwargs.get('gamma', 0.995)
        self.lambd = kwargs.get('lambd', 0.97)
        self.resize = kwargs.get('resize', 84)
        self.seqlen = kwargs.get('seqlen', 1)
        self.state_shape = []

        self.gamlam = self.gamma * self.lambd
        self.memory = HorizonMemory()
        self.replay = BatchMemory()

        self.actor = tf.keras.models.Model()
        self.critic = tf.keras.models.Model()
        self.preprocess_obs = (lambda a, b=None, c=None, d=None, e=None: a)
        self.entropy_func = (lambda a, b: a)
        self.log_pi_func = (lambda a, b: a)

    def dummy_forward(self):
        dummy_s = np.zeros([1] + self.state_shape, dtype=np.float32)
        self.actor(dummy_s)
        self.critic(dummy_s)

    def load_model(self, path):
        actor_path = os.path.join(path, 'actor.h5')
        critic_path = os.path.join(path, 'critic.h5')
        std_path = os.path.join(path, 'std.npy')
        if os.path.exists(actor_path):
            self.actor.load_weights(actor_path)
            if os.path.exists(std_path):
                self.actor.log_std.assign(np.load(std_path))
            print('Actor Loaded... ', actor_path)
        if os.path.exists(critic_path):
            self.critic.load_weights(critic_path)
            print('Critic Loaded... ', critic_path)

    def save_model(self, path):
        actor_path = os.path.join(path, 'actor.h5')
        critic_path = os.path.join(path, 'critic.h5')
        std_path = os.path.join(path, 'std.npy')
        self.actor.save_weights(actor_path)
        try:
            np.save(std_path, self.actor.log_std.numpy())
        except AttributeError:
            # actor has no log_std attribute (e.g. a discrete policy)
            pass
        self.critic.save_weights(critic_path)

    def get_action(self, state):
        raise NotImplementedError

    def append_horizon(self, state, action, reward, log_pi):
        self.memory.append(state, action, reward, log_pi)

    def memory_process(self, next_state, done):
        a_loss, c_loss = None, None
        states, actions, log_pis, rewards = self.memory.rollout()
        gaes, targets = self.get_gae_target(states, rewards, next_state, done)

        self.replay.append(states, actions, log_pis, gaes, targets)
        self.memory.flush()
        if len(self.replay) >= self.update_rate:
            a_loss, c_loss = self.train()
            self.replay.flush()
        return a_loss, c_loss

    def get_gae_target(self, states, rewards, next_state, done):
        states = np.concatenate(states + [next_state], axis=0)
        values = self.critic(states).numpy()

        gaes = np.zeros_like(rewards, dtype=np.float32).reshape(-1, 1)
        targets = np.zeros_like(gaes)

        gae = 0
        if done:
            values[-1][0] = 0.

        for t in reversed(range(len(gaes))):
            targets[t] = rewards[t] + self.gamma * values[t + 1]
            delta = targets[t] - values[t]
            gae = delta + self.gamlam * gae
            gaes[t] = gae
        targets = values[:-1] + gaes
        gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + 1e-8)
        return gaes.tolist(), targets.tolist()

    def train(self):
        states, actions, log_pis, gaes, targets \
            = self.replay.rollout()
        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        log_pis = np.concatenate(log_pis, axis=0).reshape(-1, 1)
        gaes = np.concatenate(gaes, axis=0).reshape(-1, 1)
        targets = np.concatenate(targets, axis=0).reshape(-1, 1)

        actor_losses, critic_losses = 0., 0.

        idx = np.arange(len(states))
        batch_num = len(states) // self.batch_size
        for _ in range(self.epoch):
            np.random.shuffle(idx)
            for i in range(batch_num):
                b_idx = idx[i * self.batch_size:(i + 1) * self.batch_size]
                s_b = states[b_idx]    # (N, S...)
                a_b = actions[b_idx]   # (N, A)
                l_b = log_pis[b_idx]   # (N, 1)
                g_b = gaes[b_idx]      # (N, 1)
                t_b = targets[b_idx]   # (N, 1)
                # update critic
                with tf.GradientTape() as tape:
                    values = self.critic(s_b)
                    critic_loss = tf.reduce_mean((values - t_b)**2)
                critic_grads = tape.gradient(critic_loss,
                                             self.critic.trainable_variables)

                # update actor
                with tf.GradientTape() as tape:
                    pred = self.actor(s_b)
                    log_pi = self.log_pi_func(a_b, pred)
                    ratio = tf.exp(log_pi - l_b)
                    clipped = tf.clip_by_value(ratio, 1 - self.clip,
                                               1 + self.clip)
                    clip_loss = -tf.reduce_mean(
                        tf.minimum(ratio * g_b, clipped * g_b))
                    entropy = self.entropy_func(pred, log_pi)

                    actor_loss = clip_loss - self.entropy * entropy
                actor_grads = tape.gradient(actor_loss,
                                            self.actor.trainable_variables)

                self.critic.opt.apply_gradients(
                    zip(critic_grads, self.critic.trainable_variables))
                self.actor.opt.apply_gradients(
                    zip(actor_grads, self.actor.trainable_variables))

                actor_losses += actor_loss.numpy()
                critic_losses += critic_loss.numpy()
        train_num = batch_num * self.epoch
        return actor_losses / train_num, critic_losses / train_num

    def play(self,
             render=False,
             verbose=False,
             delay=0,
             ep_label=0,
             test=False,
             sparsify=True):
        done = False
        score, true_score = 0., 0.
        step = 0
        horizon_step = 0

        a_losses, c_losses = [], []
        pmax = 0

        obs = self.env.reset()
        if sparsify:
            pos = int(self.env.robot.body_xyz[0])
        state = self.preprocess_obs(obs, self.env, self.resize, self.seqlen)
        if render:
            self.env.render()

        while not done:
            time.sleep(delay)
            real_action, action, log_pi, policy = self.get_action(state)

            if verbose:
                stamp = '[EP%dT%d] [Rew] %.2f (%.2f) ' % (ep_label, step,
                                                          score, true_score)
                if type(real_action) == int:
                    act_temp = '[Act] %d' % real_action
                else:
                    act_temp = '[Act]' + (' {:.2f}' * len(real_action)).format(
                        *real_action)
                if type(policy) == tuple:
                    pi_temp = ' [Mu]' + (' {:.2f}' *
                                         len(policy[0])).format(*policy[0])
                    pi_temp += ' [Std]' + (' {:.2f}' *
                                           len(policy[1])).format(*policy[1])
                else:
                    pi_temp = ' [Pi]' + (' {:.2f}' *
                                         len(policy)).format(*policy)

                print(stamp, act_temp, pi_temp, '\t', end='\r', flush=True)
            obs, true_rew, done, info = self.env.step(real_action)
            next_state = self.preprocess_obs(obs, self.env, self.resize,
                                             self.seqlen, state)

            if sparsify:
                next_pos = int(self.env.robot.body_xyz[0])
                if next_pos - pos >= 1:
                    rew = 1.
                    pos = next_pos
                else:
                    rew = 0.
            else:
                rew = true_rew

            step += 1
            score += rew
            true_score += true_rew
            horizon_step += 1
            if type(real_action) == int:
                pmax += np.max(policy)
            else:
                pmax += np.exp(log_pi.item())

            if render:
                self.env.render()

            if not test:
                self.append_horizon(state, action, rew, log_pi)

            state = next_state

            if not test:
                if horizon_step >= self.horizon or done:
                    horizon_step = 0
                    a_loss, c_loss = self.memory_process(next_state, done)
                    if a_loss is not None:
                        a_losses.append(a_loss)
                        c_losses.append(c_loss)
        # done
        if a_losses:
            a_loss = np.mean(a_losses)
            c_loss = np.mean(c_losses)
        else:
            a_loss, c_loss = 0., 0.
        pmax /= step
        stat = {
            'true_score': true_score,
            'score': score,
            'step': step,
            'actor_loss': a_loss,
            'critic_loss': c_loss,
            'pmax': pmax,
        }
        if 'end' in info:
            stat['end'] = info['end']
        if 'score' in info:
            stat['true_score'] = info['score']
        return stat

    def record(self,
               thres,
               path,
               render=False,
               verbose=False,
               delay=0,
               ep_label=0):
        done = False
        score = 0.
        step = 0

        obs = self.env.reset()
        state = self.preprocess_obs(obs, self.env, self.resize, self.seqlen)
        if render:
            self.env.render()

        while not done:
            time.sleep(delay)
            real_action, action, log_pi, _ = self.get_action(state)

            if verbose:
                stamp = '[EP%dT%d] [Rew] %.2f ' % (ep_label, step, score)
                print(stamp, '\t', end='\r', flush=True)
            obs, rew, done, _ = self.env.step(real_action)
            next_state = self.preprocess_obs(obs, self.env, self.resize,
                                             self.seqlen, state)

            step += 1
            score += rew

            if render:
                self.env.render()
            self.append_horizon(state, action, rew, log_pi)
            state = next_state

        # done
        if score >= thres:
            states, actions, _, _ = self.memory.rollout()
            states = np.concatenate(states, axis=0)
            actions = np.concatenate(actions, axis=0)
            timestamp = dt.now().strftime('%H_%M_%S')
            filename = 'T%dS%.2f_%s' % (step, score, timestamp)

            record_path = os.path.join(path, filename)
            while os.path.exists(record_path + '.npz'):
                record_path += '_'

            np.savez_compressed(record_path, state=states, action=actions)
            stamp = '[EP%dT%d] [Rew] %.2f ' % (ep_label, step, score)
            print(stamp, 'saved...', record_path)
            self.memory.flush()
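Example #5 is an abstract base: get_action raises NotImplementedError and the actor, critic, and hook attributes are placeholders. As a hedged illustration, a discrete-action subclass might wire them up roughly as below; the network layout, the .opt attribute convention that train() relies on, and the gym-style env fields are assumptions rather than the project's own implementation.

import numpy as np
import tensorflow as tf

class DiscretePPO(PPO):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # assumed gym-style spaces; the real project may preprocess differently
        self.state_shape = list(self.env.observation_space.shape)
        self.action_num = self.env.action_space.n
        self.actor = tf.keras.Sequential(
            [tf.keras.layers.Dense(u, activation='relu') for u in self.actor_units]
            + [tf.keras.layers.Dense(self.action_num, activation='softmax')])
        self.critic = tf.keras.Sequential(
            [tf.keras.layers.Dense(u, activation='relu') for u in self.critic_units]
            + [tf.keras.layers.Dense(1)])
        self.actor.opt = tf.keras.optimizers.Adam(self.actor_lr)
        self.critic.opt = tf.keras.optimizers.Adam(self.critic_lr)
        # batch the raw observation; signature tolerates the extra args play() passes
        self.preprocess_obs = lambda obs, *args: obs[np.newaxis].astype(np.float32)
        # log pi(a|s) of the taken one-hot action, shaped (N, 1)
        self.log_pi_func = lambda a, pred: tf.expand_dims(
            tf.math.log(tf.reduce_sum(pred * a, axis=1) + 1e-8), axis=1)
        # categorical entropy of the predicted policy (scalar)
        self.entropy_func = lambda pred, log_pi: -tf.reduce_mean(
            tf.reduce_sum(pred * tf.math.log(pred + 1e-8), axis=1))
        self.dummy_forward()

    def get_action(self, state):
        policy = self.actor(state).numpy()[0]
        action = int(np.random.choice(self.action_num, p=policy))
        action_one_hot = np.eye(self.action_num, dtype=np.float32)[[action]]
        log_pi = [[np.log(policy[action] + 1e-8)]]
        return action, action_one_hot, log_pi, policy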
Example #6
    def __init__(self):
        self.memory = HorizonMemory()