def __init__(self, reward_shift, actor_units, critic_units,
             disc_units, disc_reduce_units, code_units):
    # build network
    self.actor = Actor(lr=0, hidden_units=actor_units)
    self.critic = Critic(lr=0, hidden_units=critic_units)
    self.discriminator = Discriminator(
        lr=0, hidden_units=disc_units, reduce_units=disc_reduce_units)
    self.encoder = VAE_Encoder(latent_num=64)
    self.prior = DiscretePosterior(lr=0, hidden_units=code_units)
    # set hyperparameters
    self.reward_shift = reward_shift
    self.memory = HorizonMemory()
    # ready
    self.dummy_forward()
class GAIL:
    def __init__(self, vail_sample, reward_shift, reward_aug, gae_norm,
                 global_norm, actor_lr, critic_lr, disc_lr,
                 actor_units, critic_units, disc_units, disc_reduce_units,
                 gamma, lambd, clip, entropy, epochs, batch_size,
                 update_rate, data_dir, demo_list):
        # build networks
        self.actor = Actor(lr=actor_lr, hidden_units=actor_units)
        self.critic = Critic(lr=critic_lr, hidden_units=critic_units)
        self.discriminator = Discriminator(lr=disc_lr, hidden_units=disc_units,
                                           reduce_units=disc_reduce_units)
        self.encoder = VAE_Encoder(latent_num=64)
        # set hyperparameters
        self.vail_sample = vail_sample
        self.reward_shift = reward_shift
        self.reward_aug = reward_aug
        self.gae_norm = gae_norm
        self.gamma = gamma
        self.lambd = lambd
        self.gam_lam = gamma * lambd
        self.clip = clip
        self.entropy = entropy
        self.epochs = epochs
        self.batch_size = batch_size
        self.half_batch_size = batch_size // 2
        self.update_rate = update_rate
        self.grad_global_norm = global_norm
        self.beta = BETA_INIT
        # build memory
        self.memory = HorizonMemory(use_reward=reward_aug)
        self.replay = ReplayMemory()
        # build expert demonstration pipeline
        self.data_dir = data_dir
        self.demo_list = os.listdir(data_dir)
        self.demo_group_num = 500
        self.demo_rotate = 5
        assert len(demo_list) >= self.demo_group_num
        self.set_demo()
        # ready: build weights once, then cache trainable variable lists
        self.dummy_forward()
        self.actor_vars = (self.actor.trainable_variables
                           + self.encoder.trainable_variables)
        self.critic_vars = (self.critic.trainable_variables
                            + self.encoder.trainable_variables)
        self.disc_vars = (self.discriminator.trainable_variables
                          + self.encoder.trainable_variables)

    def dummy_forward(self):
        # connect networks by pushing a zero-valued batch through every model
        dummy_state = np.zeros([1] + STATE_SHAPE, dtype=np.float32)
        dummy_action = np.zeros([1] + ACTION_SHAPE, dtype=np.float32)
        self.encoder(dummy_state)
        self.actor(self.encoder, dummy_state)
        self.critic(self.encoder, dummy_state)
        self.discriminator(self.encoder, dummy_state, dummy_action)

    def set_demo(self):
        self.demo_list = os.listdir(self.data_dir)
        selected_demos = random.sample(self.demo_list, self.demo_group_num)
        expert_states = []
        expert_actions = []
        for demo_name in selected_demos:
            demo = np.load(self.data_dir + demo_name)
            expert_states.append(demo['state'])
            expert_actions.append(demo['action'])
        self.expert_states = np.concatenate(expert_states, axis=0)
        self.expert_actions = np.concatenate(expert_actions, axis=0)
        del demo

    def get_demonstration(self, sample_num):
        # resample the demo pool if it cannot cover the requested batch
        if len(self.expert_states) < sample_num:
            self.set_demo()
        index = np.arange(len(self.expert_states))
        np.random.shuffle(index)
        index = index[:sample_num]
        return self.expert_states[index], self.expert_actions[index]

    def memory_process(self, next_state, done):
        # rollout layout: [[(1,64,64,3)], ...], [[(1,2),(1,9),(1,3),(1,4)], ...],
        # [[c_pi, d_pi, s_pi, a_pi], ...]
        if self.reward_aug:
            states, actions, log_old_pis, rewards = self.memory.rollout()
        else:
            states, actions, log_old_pis = self.memory.rollout()
        np_states = np.concatenate(states + [next_state], axis=0)
        np_actions = np.concatenate(actions, axis=0)
        np_rewards = self.get_reward(np_states[:-1], np_actions)  # (N, 1)
        if self.reward_aug:
            np_env_rewards = np.stack(rewards, axis=0).reshape(-1, 1)
            np_rewards = np_rewards + np_env_rewards
        gae, oracle = self.get_gae_oracle(np_states, np_rewards, done)  # (N, 1), (N, 1)
        self.replay.append(states, actions, log_old_pis, gae, oracle)
        self.memory.flush()
        if len(self.replay) >= self.update_rate:
            self.update()
            self.replay.flush()

    def get_action(self, state):
        policy = self.actor(self.encoder, state).numpy()[0]
        action = np.random.choice(ACTION_NUM, p=policy)
        # action = np.argmax(policy)
        action_one_hot = np.eye(ACTION_NUM, dtype=np.float32)[[action]]  # (1, ACTION_NUM)
        log_old_pi = [[np.log(policy[action] + 1e-8)]]  # (1, 1)
        return action, action_one_hot, log_old_pi, policy

    def get_reward(self, states, actions):
        d = self.discriminator(self.encoder, states, actions).numpy()  # (N, 1)
        # rewards = 0.5 - d           # linear reward
        # rewards = np.tan(0.5 - d)   # tan reward
        if self.reward_shift:
            rewards = -np.log(2.0 * d + 1e-8)  # log equilibrium-shifted reward
        else:
            rewards = -np.log(d + 1e-8)        # log reward
        # rewards = 0.1 * np.where(rewards > 1, 1, rewards)
        return rewards

    def get_gae_oracle(self, states, rewards, done):
        # `states` includes the next state, so values has N+1 rows
        values = self.critic(self.encoder, states).numpy()  # (N+1, 1)
        if done:
            values[-1] = np.float32([0])
        N = len(rewards)
        gae = 0
        gaes = np.zeros((N, 1), dtype=np.float32)
        oracles = np.zeros((N, 1), dtype=np.float32)
        for t in reversed(range(N)):
            oracles[t] = rewards[t] + self.gamma * values[t + 1]
            delta = oracles[t] - values[t]
            gae = delta + self.gam_lam * gae
            gaes[t][0] = gae
        # oracles = gaes + values[:-1]  # (N, 1)
        if self.gae_norm:
            gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + 1e-8)
        return gaes, oracles

    def update(self):
        # load & flatten rollout data
        states, actions, log_old_pis, gaes, oracles = self.replay.rollout()
        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        log_old_pis = np.concatenate(log_old_pis, axis=0)
        gaes = np.concatenate(gaes, axis=0)
        oracles = np.concatenate(oracles, axis=0)
        N = len(states)
        # update discriminator: half agent samples, half expert demonstrations
        s_e, a_e = self.get_demonstration(N)
        batch_num = N // self.half_batch_size
        index = np.arange(N)
        np.random.shuffle(index)
        for i in range(batch_num):
            idx = index[i * self.half_batch_size:(i + 1) * self.half_batch_size]
            s_concat = np.concatenate([states[idx], s_e[idx]], axis=0)
            a_concat = np.concatenate([actions[idx], a_e[idx]], axis=0)
            with tf.GradientTape(persistent=True) as tape:
                mu, std, sampled = self.discriminator.encode(
                    self.encoder, s_concat, a_concat)
                discs = self.discriminator.decode(
                    sampled if self.vail_sample else mu)
                kld_loss = tf.reduce_mean(tf_gaussian_KL(mu, 0, std, 1))
                agent_loss = -tf.reduce_mean(
                    tf.math.log(discs[:self.half_batch_size] + 1e-8))
                expert_loss = -tf.reduce_mean(
                    tf.math.log(1 + 1e-8 - discs[self.half_batch_size:]))
                disc_loss = agent_loss + expert_loss
                discriminator_loss = disc_loss + self.beta * kld_loss
            disc_grads = tape.gradient(discriminator_loss, self.disc_vars)
            if self.grad_global_norm > 0:
                disc_grads, _ = tf.clip_by_global_norm(disc_grads,
                                                       self.grad_global_norm)
            self.discriminator.opt.apply_gradients(
                zip(disc_grads, self.disc_vars))
            del tape
        # TODO: update posterior
        # L1 loss = logQ(code|s,prev_a,prev_code)
        # update actor & critic
        # batch_num = math.ceil(len(states) / self.batch_size)
        batch_num = len(gaes) // self.batch_size
        index = np.arange(len(gaes))
        for _ in range(self.epochs):
            np.random.shuffle(index)
            for i in range(batch_num):
                # if i == batch_num - 1:
                #     idx = index[i * self.batch_size:]
                # else:
                idx = index[i * self.batch_size:(i + 1) * self.batch_size]
                state = states[idx]
                action = actions[idx]
                log_old_pi = log_old_pis[idx]
                gae = gaes[idx]
                oracle = oracles[idx]
                # update critic
                with tf.GradientTape(persistent=True) as tape:
                    values = self.critic(self.encoder, state)  # (N, 1)
                    critic_loss = tf.reduce_mean((oracle - values) ** 2)  # MSE loss
                critic_grads = tape.gradient(critic_loss, self.critic_vars)
                if self.grad_global_norm > 0:
                    critic_grads, _ = tf.clip_by_global_norm(
                        critic_grads, self.grad_global_norm)
                self.critic.opt.apply_gradients(
                    zip(critic_grads, self.critic_vars))
                del tape
                # update actor
                with tf.GradientTape(persistent=True) as tape:
                    pred_action = self.actor(self.encoder, state)
                    # RL (PPO) term
                    log_pi = tf.expand_dims(tf.math.log(
                        tf.reduce_sum(pred_action * action, axis=1) + 1e-8),
                        axis=1)  # (N, 1)
                    ratio = tf.exp(log_pi - log_old_pi)
                    clip_ratio = tf.clip_by_value(ratio, 1 - self.clip,
                                                  1 + self.clip)
                    clip_loss = -tf.reduce_mean(
                        tf.minimum(ratio * gae, clip_ratio * gae))
                    entropy = tf.reduce_mean(tf.exp(log_pi) * log_pi)
                    actor_loss = clip_loss + self.entropy * entropy
                actor_grads = tape.gradient(
                    actor_loss, self.actor_vars)  # NOTE: freeze posterior
                if self.grad_global_norm > 0:
                    actor_grads, _ = tf.clip_by_global_norm(
                        actor_grads, self.grad_global_norm)
                self.actor.opt.apply_gradients(
                    zip(actor_grads, self.actor_vars))
                del tape
                # print('%d samples trained... D loss: %.4f C loss: %.4f A loss: %.4f\t\t\t'
                #       % (len(gaes), disc_loss, critic_loss, actor_loss), end='\r')

    def save_model(self, dir, tag=''):
        self.actor.save_weights(dir + tag + 'actor.h5')
        self.critic.save_weights(dir + tag + 'critic.h5')
        self.discriminator.save_weights(dir + tag + 'discriminator.h5')
        self.encoder.save_weights(dir + tag + 'encoder.h5')

    def load_model(self, dir, tag=''):
        if os.path.exists(dir + tag + 'actor.h5'):
            self.actor.load_weights(dir + tag + 'actor.h5')
            print('Actor loaded... %s%sactor.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'critic.h5'):
            self.critic.load_weights(dir + tag + 'critic.h5')
            print('Critic loaded... %s%scritic.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'discriminator.h5'):
            self.discriminator.load_weights(dir + tag + 'discriminator.h5')
            print('Discriminator loaded... %s%sdiscriminator.h5' % (dir, tag))
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('Encoder loaded... %s%sencoder.h5' % (dir, tag))

    def load_encoder(self, dir, tag=''):
        if os.path.exists(dir + tag + 'encoder.h5'):
            self.encoder.load_weights(dir + tag + 'encoder.h5')
            print('Encoder loaded... %s%sencoder.h5' % (dir, tag))
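# --- Usage sketch (illustrative; not part of the original sources) ---
# A minimal construction example for the GAIL agent above. All hyperparameter
# values and the demonstration directory are assumptions chosen for
# illustration; `data_dir` must hold at least demo_group_num (500) .npz files
# containing 'state' and 'action' arrays, since set_demo() samples 500 of them.
def build_gail_agent(data_dir='./demo/'):
    demo_list = os.listdir(data_dir)
    agent = GAIL(
        vail_sample=False, reward_shift=False, reward_aug=False,
        gae_norm=True, global_norm=0.5,
        actor_lr=1e-4, critic_lr=2e-4, disc_lr=2e-4,
        actor_units=[256, 128], critic_units=[256, 128],
        disc_units=[256, 128], disc_reduce_units=[64],
        gamma=0.99, lambd=0.95, clip=0.2, entropy=1e-3,
        epochs=4, batch_size=64, update_rate=2048,
        data_dir=data_dir, demo_list=demo_list)
    agent.load_model('./weights/', tag='')  # no-op if the weight files are absent
    return agent
# During rollout, each transition is appended to agent.memory (its append()
# signature lives with HorizonMemory, outside this excerpt), and
# agent.memory_process(next_state, done) is called at the end of each horizon;
# update() then runs automatically once update_rate samples have accumulated.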
class PPO(Agent):
    def __init__(self, **kwargs):
        self.env = kwargs.get('env')
        super().__init__(self.env)
        self.actor_lr = kwargs.get('actor_lr', 1e-4)
        self.critic_lr = kwargs.get('critic_lr', 2e-4)
        self.entropy = kwargs.get('entropy', 1e-4)
        self.actor_units = kwargs.get('actor_units', [128])
        self.critic_units = kwargs.get('critic_units', [128])
        self.horizon = kwargs.get('horizon', 64)
        self.update_rate = kwargs.get('update_rate', 2048)
        self.batch_size = kwargs.get('batch_size', 64)
        self.epoch = kwargs.get('epoch', 4)
        self.clip = kwargs.get('clip', 0.2)
        self.gamma = kwargs.get('gamma', 0.995)
        self.lambd = kwargs.get('lambd', 0.97)
        self.resize = kwargs.get('resize', 84)
        self.seqlen = kwargs.get('seqlen', 1)
        self.state_shape = []
        self.gamlam = self.gamma * self.lambd
        self.memory = HorizonMemory()
        self.replay = BatchMemory()
        # placeholders; concrete subclasses replace these with real networks
        # and preprocessing / log-prob / entropy functions
        self.actor = tf.keras.models.Model()
        self.critic = tf.keras.models.Model()
        self.preprocess_obs = (lambda a, b=None, c=None, d=None, e=None: a)
        self.entropy_func = (lambda a, b: a)
        self.log_pi_func = (lambda a, b: a)

    def dummy_forward(self):
        dummy_s = np.zeros([1] + self.state_shape, dtype=np.float32)
        self.actor(dummy_s)
        self.critic(dummy_s)

    def load_model(self, path):
        actor_path = os.path.join(path, 'actor.h5')
        critic_path = os.path.join(path, 'critic.h5')
        std_path = os.path.join(path, 'std.npy')
        if os.path.exists(actor_path):
            self.actor.load_weights(actor_path)
            if os.path.exists(std_path):
                self.actor.log_std.assign(np.load(std_path))
            print('Actor Loaded... ', actor_path)
        if os.path.exists(critic_path):
            self.critic.load_weights(critic_path)
            print('Critic Loaded... ', critic_path)

    def save_model(self, path):
        actor_path = os.path.join(path, 'actor.h5')
        critic_path = os.path.join(path, 'critic.h5')
        std_path = os.path.join(path, 'std.npy')
        self.actor.save_weights(actor_path)
        try:
            np.save(std_path, self.actor.log_std.numpy())
        except AttributeError:
            # discrete-action actors have no log_std
            pass
        self.critic.save_weights(critic_path)

    def get_action(self, state):
        raise NotImplementedError

    def append_horizon(self, state, action, reward, log_pi):
        self.memory.append(state, action, reward, log_pi)

    def memory_process(self, next_state, done):
        a_loss, c_loss = None, None
        states, actions, log_pis, rewards = self.memory.rollout()
        gaes, targets = self.get_gae_target(states, rewards, next_state, done)
        self.replay.append(states, actions, log_pis, gaes, targets)
        self.memory.flush()
        if len(self.replay) >= self.update_rate:
            a_loss, c_loss = self.train()
            self.replay.flush()
        return a_loss, c_loss

    def get_gae_target(self, states, rewards, next_state, done):
        states = np.concatenate(states + [next_state], axis=0)
        values = self.critic(states).numpy()
        gaes = np.zeros_like(rewards, dtype=np.float32).reshape(-1, 1)
        targets = np.zeros_like(gaes)
        gae = 0
        if done:
            values[-1][0] = 0.
        for t in reversed(range(len(gaes))):
            targets[t] = rewards[t] + self.gamma * values[t + 1]
            delta = targets[t] - values[t]
            gae = delta + self.gamlam * gae  # GAE recursion
            gaes[t] = gae
        targets = values[:-1] + gaes
        gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + 1e-8)
        return gaes.tolist(), targets.tolist()

    def train(self):
        states, actions, log_pis, gaes, targets = self.replay.rollout()
        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        log_pis = np.concatenate(log_pis, axis=0).reshape(-1, 1)
        gaes = np.concatenate(gaes, axis=0).reshape(-1, 1)
        targets = np.concatenate(targets, axis=0).reshape(-1, 1)
        actor_losses, critic_losses = 0., 0.
        idx = np.arange(len(states))
        batch_num = len(states) // self.batch_size
        for _ in range(self.epoch):
            np.random.shuffle(idx)
            for i in range(batch_num):
                # index with the shuffled permutation so each epoch sees the
                # minibatches in a different order
                b = idx[i * self.batch_size:(i + 1) * self.batch_size]
                s_b = states[b]   # (B, S...)
                a_b = actions[b]  # (B, A)
                l_b = log_pis[b]  # (B, 1)
                g_b = gaes[b]     # (B, 1)
                t_b = targets[b]  # (B, 1)
                # update critic
                with tf.GradientTape() as tape:
                    values = self.critic(s_b)
                    critic_loss = tf.reduce_mean((values - t_b) ** 2)
                critic_grads = tape.gradient(critic_loss,
                                             self.critic.trainable_variables)
                # update actor
                with tf.GradientTape() as tape:
                    pred = self.actor(s_b)
                    log_pi = self.log_pi_func(a_b, pred)
                    ratio = tf.exp(log_pi - l_b)
                    clipped = tf.clip_by_value(ratio, 1 - self.clip,
                                               1 + self.clip)
                    clip_loss = -tf.reduce_mean(
                        tf.minimum(ratio * g_b, clipped * g_b))
                    entropy = self.entropy_func(pred, log_pi)
                    actor_loss = clip_loss - self.entropy * entropy
                actor_grads = tape.gradient(actor_loss,
                                            self.actor.trainable_variables)
                self.critic.opt.apply_gradients(
                    zip(critic_grads, self.critic.trainable_variables))
                self.actor.opt.apply_gradients(
                    zip(actor_grads, self.actor.trainable_variables))
                actor_losses += actor_loss.numpy()
                critic_losses += critic_loss.numpy()
        train_num = batch_num * self.epoch
        return actor_losses / train_num, critic_losses / train_num

    def play(self, render=False, verbose=False, delay=0, ep_label=0,
             test=False, sparsify=True):
        done = False
        score, true_score = 0., 0.
        step = 0
        horizon_step = 0
        a_losses, c_losses = [], []
        pmax = 0
        obs = self.env.reset()
        if sparsify:
            pos = int(self.env.robot.body_xyz[0])
        state = self.preprocess_obs(obs, self.env, self.resize, self.seqlen)
        if render:
            self.env.render()
        while not done:
            time.sleep(delay)
            real_action, action, log_pi, policy = self.get_action(state)
            if verbose:
                stamp = '[EP%dT%d] [Rew] %.2f (%.2f) ' % (ep_label, step,
                                                          score, true_score)
                if type(real_action) == int:
                    act_temp = '[Act] %d' % real_action
                else:
                    act_temp = '[Act]' + (' {:.2f}' * len(real_action)).format(
                        *real_action)
                if type(policy) == tuple:
                    pi_temp = ' [Mu]' + (' {:.2f}' * len(policy[0])).format(*policy[0])
                    pi_temp += ' [Std]' + (' {:.2f}' * len(policy[1])).format(*policy[1])
                else:
                    pi_temp = ' [Pi]' + (' {:.2f}' * len(policy)).format(*policy)
                print(stamp, act_temp, pi_temp, '\t', end='\r', flush=True)
            obs, true_rew, done, info = self.env.step(real_action)
            next_state = self.preprocess_obs(obs, self.env, self.resize,
                                             self.seqlen, state)
            if sparsify:
                # sparse reward: +1 each time the robot advances one unit in x
                next_pos = int(self.env.robot.body_xyz[0])
                if next_pos - pos >= 1:
                    rew = 1.
                    pos = next_pos
                else:
                    rew = 0.
            else:
                rew = true_rew
            step += 1
            score += rew
            true_score += true_rew
            horizon_step += 1
            if type(real_action) == int:
                pmax += np.max(policy)
            else:
                pmax += np.exp(log_pi.item())
            if render:
                self.env.render()
            if not test:
                self.append_horizon(state, action, rew, log_pi)
            state = next_state
            if not test:
                if horizon_step >= self.horizon or done:
                    horizon_step = 0
                    a_loss, c_loss = self.memory_process(next_state, done)
                    if a_loss:
                        a_losses.append(a_loss)
                        c_losses.append(c_loss)
        # episode done
        if a_losses:
            a_loss = np.mean(a_losses)
            c_loss = np.mean(c_losses)
        else:
            a_loss, c_loss = 0., 0.
        pmax /= step
        stat = {
            'true_score': true_score,
            'score': score,
            'step': step,
            'actor_loss': a_loss,
            'critic_loss': c_loss,
            'pmax': pmax,
        }
        if 'end' in info:
            stat['end'] = info['end']
        if 'score' in info:
            stat['true_score'] = info['score']
        return stat

    def record(self, thres, path, render=False, verbose=False, delay=0,
               ep_label=0):
        done = False
        score = 0.
        step = 0
        obs = self.env.reset()
        state = self.preprocess_obs(obs, self.env, self.resize, self.seqlen)
        if render:
            self.env.render()
        while not done:
            time.sleep(delay)
            real_action, action, log_pi, _ = self.get_action(state)
            if verbose:
                stamp = '[EP%dT%d] [Rew] %.2f ' % (ep_label, step, score)
                print(stamp, '\t', end='\r', flush=True)
            obs, rew, done, _ = self.env.step(real_action)
            next_state = self.preprocess_obs(obs, self.env, self.resize,
                                             self.seqlen, state)
            step += 1
            score += rew
            if render:
                self.env.render()
            self.append_horizon(state, action, rew, log_pi)
            state = next_state
        # episode done: save the trajectory if it clears the score threshold
        if score >= thres:
            states, actions, _, _ = self.memory.rollout()
            states = np.concatenate(states, axis=0)
            actions = np.concatenate(actions, axis=0)
            timestamp = dt.now().strftime('%H_%M_%S')
            filename = 'T%dS%.2f_%s' % (step, score, timestamp)
            record_path = os.path.join(path, filename)
            while os.path.exists(record_path + '.npz'):
                record_path += '_'
            np.savez_compressed(record_path, state=states, action=actions)
            stamp = '[EP%dT%d] [Rew] %.2f ' % (ep_label, step, score)
            print(stamp, 'saved...', record_path)
        self.memory.flush()
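# --- Usage sketch (illustrative; not part of the original sources) ---
# PPO above is an abstract base: a concrete agent must supply actor/critic
# networks (exposing an `opt` optimizer attribute, as train() expects), plus
# preprocess_obs, log_pi_func, entropy_func and get_action(). DiscretePPO
# below is a hypothetical minimal example for a flat-observation,
# discrete-action Gym-style env; it reuses the module's existing imports
# (os, numpy as np, tensorflow as tf).
def _mlp(hidden_units, out_dim, out_activation, lr):
    # hypothetical helper: small feed-forward net with an attached Adam
    # optimizer, matching the `model.opt.apply_gradients(...)` call in train()
    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(u, activation='tanh') for u in hidden_units]
        + [tf.keras.layers.Dense(out_dim, activation=out_activation)])
    model.opt = tf.keras.optimizers.Adam(lr)
    return model


class DiscretePPO(PPO):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.state_shape = list(self.env.observation_space.shape)
        self.action_num = self.env.action_space.n
        self.actor = _mlp(self.actor_units, self.action_num, 'softmax',
                          self.actor_lr)
        self.critic = _mlp(self.critic_units, 1, None, self.critic_lr)
        # log-prob of the taken one-hot action and policy entropy, in the
        # shapes train() expects: log_pi_func(a_b, pred), entropy_func(pred, log_pi)
        self.log_pi_func = lambda a, pred: tf.expand_dims(
            tf.math.log(tf.reduce_sum(pred * a, axis=1) + 1e-8), axis=1)
        self.entropy_func = lambda pred, log_pi: -tf.reduce_mean(
            tf.reduce_sum(pred * tf.math.log(pred + 1e-8), axis=1))
        self.preprocess_obs = lambda obs, *args: np.float32(obs).reshape(1, -1)
        self.dummy_forward()

    def get_action(self, state):
        policy = self.actor(state).numpy()[0]
        action = int(np.random.choice(self.action_num, p=policy))
        action_one_hot = np.eye(self.action_num, dtype=np.float32)[[action]]
        log_pi = np.log(policy[[action]] + 1e-8).reshape(1, 1)
        return action, action_one_hot, log_pi, policy

# e.g. (hypothetical): agent = DiscretePPO(env=gym.make('CartPole-v1'))
#      stat = agent.play(sparsify=False)  # sparsify=True needs env.robot.body_xyz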
def __init__(self):
    self.memory = HorizonMemory()