def __init__(self, input_shape, action_n, gamma=0.99, N=50000):
    self.shape = input_shape
    self.batch_size = input_shape[0]
    self.N = N

    Q = QFunction(input_shape, action_n, scope="Q")
    target_Q = QFunction(input_shape, action_n, scope="target_Q")

    # Forward Q
    self.s = tf.placeholder(shape=[None] + input_shape[1:], dtype=tf.float32)
    self.a = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.int32)
    self.probs = Q(self.s, s_bias=False)

    # add batch offset
    first = tf.expand_dims(tf.range(self.batch_size), axis=1)
    indices = tf.concat(values=[first, self.a], axis=1)
    # gather the corresponding q-values
    self.q_val = tf.expand_dims(tf.gather_nd(self.probs, indices), axis=1)

    # TD target
    self.done = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)
    self.r = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)
    self.s_ = tf.placeholder(shape=input_shape, dtype=tf.float32)

    # Double DQN: select the action with the online network, evaluate it with the target network
    a_max = tf.expand_dims(tf.argmax(Q(self.s_, reuse=True), axis=1), axis=1)
    a_max = tf.to_int32(a_max)
    target_q_val = tf.expand_dims(
        tf.gather_nd(target_Q(self.s_), tf.concat(values=[first, a_max], axis=1)), axis=1)
    self.y = self.r + gamma * (1.0 - self.done) * target_q_val

    # error clipping via the Huber loss
    self.loss = tf.reduce_mean(Hurber_loss(self.q_val, self.y))

    # update Q
    opt = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
    grads_and_vars = opt.compute_gradients(self.loss)
    grads_and_vars = [[grad, var] for grad, var in grads_and_vars
                      if grad is not None and (var.name.startswith("Q") or var.name.startswith("shared"))]
    self.train_op = opt.apply_gradients(grads_and_vars)

    # update target Q
    self.target_train_op = copy_params(Q, target_Q)

    # replay buffer
    self.D = []
def __init__(self, action_n):
    # create Q networks
    Q = QFunction(action_n, scope="Q")
    target_Q = QFunction(action_n, scope="target_Q")

    # define placeholders
    self.s = tf.placeholder(
        shape=[None, cfg.height, cfg.width, cfg.state_length], dtype=tf.float32)
    self.a = tf.placeholder(shape=[cfg.batch_size, 1], dtype=tf.int32)
    self.r = tf.placeholder(shape=[cfg.batch_size, 1], dtype=tf.float32)
    self.done = tf.placeholder(shape=[cfg.batch_size, 1], dtype=tf.float32)
    self.next_s = tf.placeholder(
        shape=[cfg.batch_size, cfg.height, cfg.width, cfg.state_length], dtype=tf.float32)

    # predict Q values
    self.probs = Q(self.s)

    # add batch offset
    first = tf.expand_dims(tf.range(cfg.batch_size), axis=1)
    # gather the Q value of the taken action
    q_val = tf.expand_dims(
        tf.gather_nd(self.probs, tf.concat([first, self.a], axis=1)), axis=1)

    # build the TD target (Double DQN)
    a_max = tf.expand_dims(
        tf.argmax(Q(self.next_s, reuse=True), axis=1, output_type=tf.int32), axis=1)
    target_q_val = tf.expand_dims(
        tf.gather_nd(target_Q(self.next_s), tf.concat([first, a_max], axis=1)), axis=1)
    y = self.r + cfg.gamma * (1.0 - self.done) * target_q_val

    # calculate loss
    self.loss = huber_loss(y, q_val)

    # update Q
    opt = tf.train.AdamOptimizer(cfg.eta)
    self.train_op = opt.minimize(self.loss)

    # update target Q
    self.target_train_op = copy_params(Q, target_Q)
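# A minimal, self-contained NumPy sketch of the Double-DQN target built by the
# graphs above: the online network picks a_max = argmax_a Q(s', a), and the
# target network supplies the value of that action. The random arrays below
# are stand-ins for real network outputs, not part of the original code.
import numpy as np

def double_dqn_target(q_online_next, q_target_next, r, done, gamma=0.99):
    # q_online_next, q_target_next: [batch, n_actions]; r, done: [batch, 1]
    a_max = np.argmax(q_online_next, axis=1)                    # action selection (online net)
    target_q_val = q_target_next[np.arange(len(a_max)), a_max]  # action evaluation (target net)
    return r + gamma * (1.0 - done) * target_q_val[:, None]

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    q_on, q_tar = rng.normal(size=(4, 3)), rng.normal(size=(4, 3))
    r, done = rng.normal(size=(4, 1)), np.zeros((4, 1))
    print(double_dqn_target(q_on, q_tar, r, done))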
class DDPGAgent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.action_boundary = env_params['action_boundary']
        self.lr_a = args.lr_a
        self.lr_c = args.lr_c
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_eps = args.noise_eps
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)

        self.actor = DeterministicPolicy(self.o_dim, self.a_dim).to(self.device)
        self.actor_tar = DeterministicPolicy(self.o_dim, self.a_dim).to(self.device)
        self.critic = QFunction(self.o_dim, self.a_dim).to(self.device)
        self.critic_tar = QFunction(self.o_dim, self.a_dim).to(self.device)

        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

        self.hard_update()

    def hard_update(self):
        self.actor_tar.load_state_dict(self.actor.state_dict())
        self.critic_tar.load_state_dict(self.critic.state_dict())

    def soft_update(self):
        for params, params_tar in zip(self.actor.parameters(), self.actor_tar.parameters()):
            params_tar.data.copy_(self.tau * params.data + (1 - self.tau) * params_tar.data)
        for params, params_tar in zip(self.critic.parameters(), self.critic_tar.parameters()):
            params_tar.data.copy_(self.tau * params.data + (1 - self.tau) * params_tar.data)

    def choose_action(self, obs, is_evaluate=False):
        obs = torch.from_numpy(obs).float().to(self.device)
        with torch.no_grad():
            action = self.actor(obs)
        if not is_evaluate:
            action += torch.normal(torch.tensor(0.), torch.tensor(self.noise_eps))
        action = torch.clamp(action, -self.action_boundary, self.action_boundary).cpu().detach().numpy()
        return action

    def rollout(self, env, memory, is_evaluate=False):
        total_reward = 0.
        obs = env.reset()
        done = False
        while not done:
            a = self.choose_action(obs, is_evaluate)
            obs_, r, done, info = env.step(a)
            memory.store(obs, a, r, obs_, done)
            total_reward += r
            obs = obs_
        return total_reward

    def update(self, memory):
        obs, a, r, obs_, done = memory.sample_batch(self.batch_size)
        obs = torch.from_numpy(obs).float().to(self.device)
        a = torch.from_numpy(a).float().to(self.device)
        r = torch.from_numpy(r).float().to(self.device)
        obs_ = torch.from_numpy(obs_).float().to(self.device)
        done = torch.from_numpy(done).float().to(self.device)

        # critic update: one-step TD target from the target networks
        with torch.no_grad():
            next_action_tar = self.actor_tar(obs_)
            next_q_tar = self.critic_tar(obs_, next_action_tar)
            critic_target = r + (1 - done) * self.gamma * next_q_tar
        critic_eval = self.critic(obs, a)
        loss_critic = F.mse_loss(critic_eval, critic_target.detach())
        self.optimizer_c.zero_grad()
        loss_critic.backward()
        self.optimizer_c.step()

        # actor update: deterministic policy gradient through the critic
        loss_actor = -self.critic(obs, self.actor(obs)).mean()
        self.optimizer_a.zero_grad()
        loss_actor.backward()
        self.optimizer_a.step()

        self.soft_update()

    def save_model(self, remark):
        if not os.path.exists('pretrained_model/'):
            os.mkdir('pretrained_model/')
        path = 'pretrained_model/{}.pt'.format(remark)
        print('Saving model to {}'.format(path))
        torch.save(self.actor.state_dict(), path)

    def load_model(self, remark):
        path = 'pretrained_model/{}.pt'.format(remark)
        print('Loading model from {}'.format(path))
        model = torch.load(path)
        self.actor.load_state_dict(model)
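# A minimal, self-contained sketch of the Polyak (soft) target update that
# DDPGAgent.soft_update performs: target <- tau * online + (1 - tau) * target.
# The small Linear modules below are stand-ins for the actual actor/critic
# networks, used only to make the snippet runnable.
import torch
import torch.nn as nn

def soft_update(online: nn.Module, target: nn.Module, tau: float = 0.005):
    with torch.no_grad():
        for p, p_tar in zip(online.parameters(), target.parameters()):
            p_tar.data.copy_(tau * p.data + (1 - tau) * p_tar.data)

if __name__ == '__main__':
    online, target = nn.Linear(4, 2), nn.Linear(4, 2)
    target.load_state_dict(online.state_dict())  # hard update first, as in __init__
    soft_update(online, target, tau=0.05)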
class TD3Agent():
    def __init__(self, s_dim, a_dim, action_space, args):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.action_space = action_space
        self.lr_pi = args.lr_pi
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.batch_size = args.batch_size
        self.policy_update_interval = args.policy_update_interval
        self.device = torch.device(args.device)
        self.policy_loss_log = torch.tensor(0.).to(self.device)

        self.policy = DeterministicPolicy(
            self.s_dim, self.a_dim, self.device, action_space=self.action_space).to(self.device)
        self.policy_target = DeterministicPolicy(
            self.s_dim, self.a_dim, self.device, action_space=self.action_space).to(self.device)
        self.Q1 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.hard_update_target()

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

    def hard_update_target(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q1_target.load_state_dict(self.Q1.state_dict())
        self.Q2_target.load_state_dict(self.Q2.state_dict())

    def soft_update_target(self):
        for param, param_target in zip(self.policy.parameters(), self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q1.parameters(), self.Q1_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q2.parameters(), self.Q2_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))

    def choose_action(self, s):
        s = torch.from_numpy(s).to(self.device).float()
        return self.policy.sample(s).cpu().detach().numpy()

    def learn(self, memory, total_step):
        s, a, r, s_, done = memory.sample_batch(self.batch_size)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)
        done = torch.from_numpy(done).to(self.device).unsqueeze(dim=1)

        # target policy smoothing: add clipped noise to the target action
        noise = (torch.randn_like(a) * self.noise_std).clamp(-self.noise_clip, self.noise_clip)
        a_target_next = self.policy_target.sample(s_) + noise

        # clipped double-Q learning: take the minimum of the two target critics
        q1_next = self.Q1_target(s_, a_target_next)
        q2_next = self.Q2_target(s_, a_target_next)
        q_next_min = torch.min(q1_next, q2_next)
        q_loss_target = r + (1 - done) * self.gamma * q_next_min

        # update Q1
        q1_loss_pred = self.Q1(s, a)
        q1_loss = F.mse_loss(q1_loss_pred, q_loss_target.detach())
        self.optimizer_q1.zero_grad()
        q1_loss.backward()
        self.optimizer_q1.step()

        # update Q2
        q2_loss_pred = self.Q2(s, a)
        q2_loss = F.mse_loss(q2_loss_pred, q_loss_target.detach())
        self.optimizer_q2.zero_grad()
        q2_loss.backward()
        self.optimizer_q2.step()

        # delayed policy update
        if total_step % self.policy_update_interval == 0:
            policy_loss = -self.Q1(s, self.policy.sample(s)).mean()
            self.optimizer_pi.zero_grad()
            policy_loss.backward()
            self.optimizer_pi.step()
            self.soft_update_target()
            self.policy_loss_log = policy_loss

        return q1_loss.item(), q2_loss.item(), self.policy_loss_log.item()

    def save_model(self, env_name, remarks='', pi_path=None, q1_path=None, q2_path=None):
        if not os.path.exists('pretrained_models/'):
            os.mkdir('pretrained_models/')
        if pi_path is None:
            pi_path = 'pretrained_models/policy_{}_{}'.format(env_name, remarks)
        if q1_path is None:
            q1_path = 'pretrained_models/q1_{}_{}'.format(env_name, remarks)
        if q2_path is None:
            q2_path = 'pretrained_models/q2_{}_{}'.format(env_name, remarks)
        print('Saving model to {}, {} and {}'.format(pi_path, q1_path, q2_path))
        torch.save(self.policy.state_dict(), pi_path)
        torch.save(self.Q1.state_dict(), q1_path)
        torch.save(self.Q2.state_dict(), q2_path)

    def load_model(self, pi_path, q1_path, q2_path):
        print('Loading models from {}, {} and {}'.format(pi_path, q1_path, q2_path))
        self.policy.load_state_dict(torch.load(pi_path))
        self.Q1.load_state_dict(torch.load(q1_path))
        self.Q2.load_state_dict(torch.load(q2_path))
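# A minimal, self-contained sketch of the TD3 target computed in learn() above:
# target policy smoothing (clipped Gaussian noise on the target action) plus
# clipped double-Q learning (minimum of the two target critics). The lambdas
# below are stand-ins for the target policy and target critics.
import torch

def td3_target(s_, r, done, policy_target, q1_target, q2_target,
               gamma=0.99, noise_std=0.2, noise_clip=0.5):
    with torch.no_grad():
        a_next = policy_target(s_)
        noise = (torch.randn_like(a_next) * noise_std).clamp(-noise_clip, noise_clip)
        a_next = a_next + noise
        q_min = torch.min(q1_target(s_, a_next), q2_target(s_, a_next))
        return r + (1 - done) * gamma * q_min

if __name__ == '__main__':
    s_, r, done = torch.randn(8, 3), torch.randn(8, 1), torch.zeros(8, 1)
    pi = lambda s: torch.tanh(s[:, :2])
    q1 = lambda s, a: s.sum(dim=1, keepdim=True) + a.sum(dim=1, keepdim=True)
    q2 = lambda s, a: s.sum(dim=1, keepdim=True) - a.sum(dim=1, keepdim=True)
    print(td3_target(s_, r, done, pi, q1, q2).shape)  # torch.Size([8, 1])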
class TD3Agent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_boundary = env_params['action_boundary']
        self.max_episode_steps = env_params['max_episode_steps']
        self.evaluate_episodes = args.evaluate_episodes
        self.lr_pi = args.lr_pi_TD3
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.action_var = args.action_var
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.K_updates = args.K_updates_TD3
        self.policy_update_interval = args.policy_update_interval
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.load_model_remark = args.load_model_remark

        self.total_trained_goal_num = 0
        self.total_episode_num = 0
        self.total_update_num = 0
        self.policy_loss_log = 0.
        self.q1_loss_log = 0.
        self.q2_loss_log = 0.

        self.memory = MemoryBuffer(args.memory_capacity, self.o_dim, self.g_dim, self.a_dim)

        self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim,
                                     self.action_var, self.device).to(self.device)
        self.policy_target = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim,
                                            self.action_var, self.device).to(self.device)
        self.Q1 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

        self.hard_update()

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q1_target.load_state_dict(self.Q1.state_dict())
        self.Q2_target.load_state_dict(self.Q2.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(), self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q1.parameters(), self.Q1_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q2.parameters(), self.Q2_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))

    def select_action(self, observation, goal):
        observation = torch.from_numpy(observation).float().to(self.device)
        goal = torch.from_numpy(goal).float().to(self.device)
        input_tensor = torch.cat([observation, goal], dim=0)
        with torch.no_grad():
            dist = self.policy(input_tensor)
            action = dist.sample()
        action = action.cpu().detach().numpy()
        action = np.clip(action, -self.action_boundary, self.action_boundary)
        return action

    def train_and_evaluate(self, goals, env, logger=None):
        returns = np.zeros(shape=[len(goals)], dtype=np.float32)
        for i_goal, goal in enumerate(goals):
            success_count = 0
            cumulative_r = 0.
            # for logging
            used_steps = 0
            cumulative_loss_pi, cumulative_loss_q1, cumulative_loss_q2 = 0., 0., 0.
            print('--{} goal: ({:.4f}, {:.4f}):-----------------------'.format(
                self.total_trained_goal_num, goal[0], goal[1]))
            for i_episode in range(self.evaluate_episodes):
                success_flag = 0
                _ = env.reset()
                obs = env.set_goal(goal)
                for i_step in range(self.max_episode_steps):
                    a = self.select_action(obs['observation'], obs['desired_goal'])
                    obs_, reward, done, info = env.step(a)
                    self.memory.store(obs['observation'], a, reward,
                                      obs_['observation'], obs['desired_goal'])
                    cumulative_r += reward
                    used_steps += 1
                    if success_flag == 0 and info['is_success'] == 1:
                        success_flag = 1
                        break
                    obs = obs_
                if len(self.memory) > self.batch_size:
                    loss_q1, loss_q2, loss_pi = self.update()  # need change
                    cumulative_loss_pi += loss_pi
                    cumulative_loss_q1 += loss_q1
                    cumulative_loss_q2 += loss_q2
                success_count += success_flag
            average_success = success_count / self.evaluate_episodes
            returns[i_goal] = average_success
            self.total_trained_goal_num += 1
            self.total_episode_num += self.evaluate_episodes
            if logger is not None:
                logger.add_scalar('Indicator/reward_per_step', cumulative_r / used_steps,
                                  self.total_trained_goal_num)
                logger.add_scalar('Indicator/goal_success_rate', average_success,
                                  self.total_trained_goal_num)
                logger.add_scalar('loss/loss_pi', cumulative_loss_pi / self.evaluate_episodes,
                                  self.total_update_num)
                logger.add_scalar('loss/loss_q1', cumulative_loss_q1 / self.evaluate_episodes,
                                  self.total_update_num)
                logger.add_scalar('loss/loss_q2', cumulative_loss_q2 / self.evaluate_episodes,
                                  self.total_update_num)
            print('\t success_rate: {:.2f}'.format(average_success))
            print('\t average_episode_return: {:.4f}'.format(cumulative_r / self.evaluate_episodes))
        return returns

    def evaluate_goal(self, goals, env):
        returns = np.zeros(shape=[len(goals)], dtype=np.float32)
        for i_goal, goal in enumerate(goals):
            success_count = 0
            for i_episode in range(self.evaluate_episodes):
                _ = env.reset()
                obs = env.set_goal(goal)
                for i_step in range(self.max_episode_steps):
                    a = self.select_action(obs['observation'], obs['desired_goal'])
                    obs_, reward, done, info = env.step(a)
                    if info['is_success'] == 1:
                        success_count += 1
                        break
                    obs = obs_
            average_success = success_count / self.evaluate_episodes
            print('{} goal: {} {} return: {}'.format(i_goal, goal[0], goal[1], average_success))
            returns[i_goal] = average_success
        return returns

    def update(self):
        for i in range(self.K_updates):
            o, a, r, o_, g = self.memory.sample(self.batch_size)
            o = torch.from_numpy(o).to(self.device)
            a = torch.from_numpy(a).to(self.device)
            r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
            o_ = torch.from_numpy(o_).to(self.device)
            g = torch.from_numpy(g).to(self.device)

            o_g_input = torch.cat([o, g], dim=1)
            next_o_g_input = torch.cat([o_, g], dim=1)
            o_g_a_input = torch.cat([o, g, a], dim=1)

            # target policy smoothing: clipped noise on the target action
            noise = (torch.randn_like(a) * self.noise_std).clamp(-self.noise_clip, self.noise_clip)
            a_target_next = self.policy_target(next_o_g_input).sample() + noise
            next_o_a_target_g_input = torch.cat([o_, g, a_target_next], dim=1)

            # clipped double-Q target
            q1_next = self.Q1_target(next_o_a_target_g_input)
            q2_next = self.Q2_target(next_o_a_target_g_input)
            q_next_min = torch.min(q1_next, q2_next)
            q_loss_tar = r + self.gamma * q_next_min

            q1_loss_pred = self.Q1(o_g_a_input)
            q1_loss = F.mse_loss(q1_loss_pred, q_loss_tar.detach())
            self.optimizer_q1.zero_grad()
            q1_loss.backward()
            self.optimizer_q1.step()

            q2_loss_pred = self.Q2(o_g_a_input)
            q2_loss = F.mse_loss(q2_loss_pred, q_loss_tar.detach())
            self.optimizer_q2.zero_grad()
            q2_loss.backward()
            self.optimizer_q2.step()

            self.total_update_num += 1
            self.q1_loss_log = q1_loss.cpu().detach().numpy()
            self.q2_loss_log = q2_loss.cpu().detach().numpy()

            # delayed policy update
            if self.total_update_num % self.policy_update_interval == 0:
                actions = self.policy(o_g_input).sample()
                policy_loss = -self.Q1(torch.cat([o_g_input, actions], dim=1)).mean()
                self.optimizer_pi.zero_grad()
                policy_loss.backward()
                self.optimizer_pi.step()
                self.policy_loss_log = policy_loss.cpu().detach().numpy()
                self.soft_update()
        return self.q1_loss_log, self.q2_loss_log, self.policy_loss_log

    def save_model(self, remark):
        if not os.path.exists('pretrained_models_TD3/'):
            os.mkdir('pretrained_models_TD3/')
        path = 'pretrained_models_TD3/{}.pt'.format(remark)
        print('Saving model to {}'.format(path))
        torch.save(self.policy.state_dict(), path)

    def load_model(self):
        print('Loading models with remark {}'.format(self.load_model_remark))
        policy_model = torch.load('pretrained_models_TD3/{}.pt'.format(self.load_model_remark),
                                  map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
class DDPG_Agent():
    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start
        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim, eps=1e-2, clip_range=1.)
        self.memory = Memory(size=args.memory_size, s_dim=self.s_dim, a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        # curiosity modules: state encoder plus forward/inverse dynamics models
        self.encoder = StateEncoder(s_dim=self.s_dim, f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim, a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim, a_dim=self.a_dim).to(self.device)
        self.optimizer_forward = optim.Adam(
            [{'params': self.EnvForward.parameters()},
             {'params': self.encoder.parameters()}], lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{'params': self.EnvInverse.parameters()},
             {'params': self.encoder.parameters()}], lr=self.lr_model)

        self.hard_update()
        self.update_num = 0

    def select_action(self, state, train_mode=True):
        s = self.normalize_input(state)
        s = torch.tensor(s, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action = self.policy(s).cpu().numpy()
        if train_mode:
            # Gaussian exploration noise
            action += np.random.randn(self.a_dim) * self.noise_eps * self.action_bound
        action = np.clip(action, a_min=-self.action_bound, a_max=self.action_bound)
        return action

    def get_intrisic_reward(self, s, a, s_):
        s = torch.from_numpy(s).to(self.device).float()
        a = torch.from_numpy(a).to(self.device).float()
        s_ = torch.from_numpy(s_).to(self.device).float()
        with torch.no_grad():
            feature = self.encoder(s)
            next_feature_pred = self.EnvForward(feature, a)
            next_feature = self.encoder(s_)
            # forward-model prediction error, scaled and clipped
            r_i = self.eta * torch.norm(next_feature_pred - next_feature)
            r_i = torch.clamp(r_i, min=-0.1, max=0.1)
        return r_i.cpu().detach().numpy()

    def train(self, env, logger=None):
        total_step = 0
        loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
        for i_episode in range(self.max_episode):
            obs = env.reset()
            s = get_state(obs)
            cumulative_r = 0.
            for i_step in range(self.max_timestep):
                a = self.select_action(s)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                r_i = self.get_intrisic_reward(s, a, s_)
                r = r_e + r_i
                self.memory.store(s, a, r, s_)
                s = s_
                if len(self.memory) > self.batch_size:
                    loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
                cumulative_r += r_e
                total_step += 1
            print('i_episode: {} total step: {} cumulative reward: {:.4f} is_success: {}'.format(
                i_episode, total_step, cumulative_r, info['is_success']))
            if logger is not None and i_episode % self.log_interval == 0:
                logger.add_scalar('Indicator/cumulative reward', cumulative_r, i_episode)
                logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
                logger.add_scalar('Loss/q_loss', loss_q, i_episode)
                logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
                logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)
            if i_episode % self.evaluate_interval == 0:
                success_rate = self.evaluate(env)
                if logger is not None:
                    logger.add_scalar('Indicator/success rate', success_rate, i_episode)
            if i_episode > self.save_model_start and i_episode % self.save_model_interval == 0:
                self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))

    def evaluate(self, env, render=False):
        success_count = 0
        for i_episode in range(self.evaluate_episode):
            obs = env.reset()
            s = get_state(obs)
            for i_step in range(self.max_timestep):
                if render:
                    env.render()
                a = self.select_action(s, train_mode=False)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                s = s_
            success_count += info['is_success']
        return success_count / self.evaluate_episode

    def learn(self):
        s, a, r, s_ = self.memory.sample_batch(batch_size=self.batch_size)
        self.normalizer_s.update(s)
        s, s_ = self.normalize_input(s, s_)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)

        # update policy and Q
        with torch.no_grad():
            a_next_tar = self.policy_target(s_)
            Q_next_tar = self.Q_target(s_, a_next_tar)
            loss_q_tar = r + self.gamma * Q_next_tar
        loss_q_pred = self.Q(s, a)
        loss_q = F.mse_loss(loss_q_pred, loss_q_tar.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        self.optimizer_q.step()

        loss_p = -self.Q(s, self.policy(s)).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        self.optimizer_p.step()

        self.soft_update()

        # update environment model and encoder
        feature = self.encoder(s)
        next_feature = self.encoder(s_)
        a_pred = self.EnvInverse(feature, next_feature)
        loss_inverse = F.mse_loss(a_pred, a)
        next_feature_pred = self.EnvForward(feature, a)
        with torch.no_grad():
            next_feature_tar = self.encoder(s_)
        loss_forward = F.mse_loss(next_feature_pred, next_feature_tar.detach())
        self.optimizer_forward.zero_grad()
        self.optimizer_inverse.zero_grad()
        loss_forward.backward(retain_graph=True)
        loss_inverse.backward()
        self.optimizer_forward.step()
        self.optimizer_inverse.step()

        self.update_num += 1
        return (loss_p.cpu().detach().numpy(), loss_q.cpu().detach().numpy(),
                loss_forward.cpu().detach().numpy(), loss_inverse.cpu().detach().numpy())

    def update_normalizer(self, states):
        states = np.array(states, dtype=np.float32)
        self.normalizer_s.update(states)

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(), self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(), self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))

    def normalize_input(self, s, s_=None):
        s = self.normalizer_s.normalize(s)
        if s_ is not None:
            s_ = self.normalizer_s.normalize(s_)
            return s, s_
        else:
            return s

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([self.normalizer_s.mean, self.normalizer_s.std,
                    self.policy.state_dict()], path)

    def load_model(self, remark):
        print('Loading models with remark {}'.format(remark))
        self.normalizer_s.mean, self.normalizer_s.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remark),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
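# A minimal, self-contained sketch of the curiosity bonus computed by
# DDPG_Agent.get_intrisic_reward: encode s and s', predict the next feature
# with the forward model, and use the scaled, clipped prediction error as an
# intrinsic reward. The tiny Linear modules are stand-ins for the real
# StateEncoder / ForwardModel (here the forward model takes a concatenated
# [feature, action] input, which is an assumption of this sketch).
import torch
import torch.nn as nn

def intrinsic_reward(encoder, forward_model, s, a, s_, eta=0.01, clip=0.1):
    with torch.no_grad():
        feat, next_feat = encoder(s), encoder(s_)
        next_feat_pred = forward_model(torch.cat([feat, a], dim=-1))
        r_i = eta * torch.norm(next_feat_pred - next_feat)
        return torch.clamp(r_i, min=-clip, max=clip)

if __name__ == '__main__':
    s_dim, a_dim, f_dim = 6, 2, 4
    encoder = nn.Linear(s_dim, f_dim)
    forward_model = nn.Linear(f_dim + a_dim, f_dim)
    s, a, s_ = torch.randn(s_dim), torch.randn(a_dim), torch.randn(s_dim)
    print(intrinsic_reward(encoder, forward_model, s, a, s_))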
def __init__(self, input_shape, action_n, N, alpha=0.5, beta=0.5, beta_decay=50000, gamma=0.99):
    self.shape = input_shape
    self.batch_size = input_shape[0]

    # prioritized replay memory
    self.pr = PrioritizedReplayBuf(N=N, alpha=alpha, beta=beta,
                                   beta_decay=beta_decay, batch_size=self.batch_size)

    # importance-sampling weights
    self.is_w = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)

    Q = QFunction(input_shape, action_n, scope="Q")
    target_Q = QFunction(input_shape, action_n, scope="target_Q")

    # Forward Q
    self.s = tf.placeholder(shape=[None] + input_shape[1:], dtype=tf.float32)
    self.a = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.int32)
    self.probs = Q(self.s, s_bias=False)

    # add batch offset
    first = tf.expand_dims(tf.range(self.batch_size), axis=1)
    indices = tf.concat(values=[first, self.a], axis=1)
    # gather the corresponding q-values
    self.q_val = tf.expand_dims(tf.gather_nd(self.probs, indices), axis=1)

    # TD target
    self.done = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)
    self.r = tf.placeholder(shape=[self.batch_size, 1], dtype=tf.float32)
    self.s_ = tf.placeholder(shape=input_shape, dtype=tf.float32)

    # Double DQN: select with the online network, evaluate with the target network
    a_max = tf.expand_dims(tf.argmax(Q(self.s_, reuse=True), axis=1), axis=1)
    a_max = tf.to_int32(a_max)
    target_q_val = tf.expand_dims(
        tf.gather_nd(target_Q(self.s_), tf.concat(values=[first, a_max], axis=1)), axis=1)
    self.y = self.r + gamma * (1.0 - self.done) * target_q_val

    # error clipping: TD error through the Huber loss
    self.delta = Hurber_loss(self.q_val, self.y)

    # importance-sampling correction, normalized by the max weight
    max_is = tf.reduce_max(self.is_w)
    self.loss = tf.reduce_mean((self.is_w / max_is) * self.delta)

    # update Q (step size reduced by a factor of four)
    opt = tf.train.RMSPropOptimizer(0.00025 / 4, 0.99, 0.0, 1e-6)
    grads_and_vars = opt.compute_gradients(self.loss)
    grads_and_vars = [[grad, var] for grad, var in grads_and_vars
                      if grad is not None and (var.name.startswith("Q") or var.name.startswith("shared"))]
    self.train_op = opt.apply_gradients(grads_and_vars)

    # update target Q
    self.target_train_op = copy_params(Q, target_Q)
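# A minimal, self-contained NumPy sketch of the importance-sampling correction
# fed into self.is_w above: for sampling probabilities P(i), the weights are
# w_i = (N * P(i))^(-beta), and the loss normalizes them by max_i w_i so the
# update scale stays bounded. The toy priorities below are illustrative only.
import numpy as np

def is_weights(priorities, beta, eps=1e-6):
    priorities = np.asarray(priorities, dtype=np.float64) + eps
    probs = priorities / priorities.sum()   # P(i) proportional to the stored priority
    w = (len(probs) * probs) ** (-beta)     # importance-sampling weights
    return w / w.max()                      # normalize by the max weight

if __name__ == '__main__':
    print(is_weights([1.0, 0.5, 0.1, 2.0], beta=0.5))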
class DDPG_Her_Agent():
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_bound = env_params['action_max']
        self.lr = args.lr
        self.l2_coefficient = args.l2_coefficient
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.tau = args.tau
        self.noise_eps = args.noise_eps

        self.policy = Policy(o_dim=self.o_dim, a_dim=self.a_dim, g_dim=self.g_dim).to(self.device)
        self.policy_target = Policy(o_dim=self.o_dim, a_dim=self.a_dim, g_dim=self.g_dim).to(self.device)
        self.Q = QFunction(o_dim=self.o_dim, a_dim=self.a_dim, g_dim=self.g_dim).to(self.device)
        self.Q_target = QFunction(o_dim=self.o_dim, a_dim=self.a_dim, g_dim=self.g_dim).to(self.device)

        sync_networks(self.policy)
        sync_networks(self.Q)

        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.normalizer_o = Normalizer(size=self.o_dim, eps=1e-2, clip_range=1.)
        self.normalizer_g = Normalizer(size=self.g_dim, eps=1e-2, clip_range=1.)

        self.hard_update()

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        for param, param_target in zip(self.policy.parameters(), self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(), self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau + param_target.data * (1 - self.tau))

    def normalize_input(self, o, g, o_=None):
        o = self.normalizer_o.normalize(o)
        g = self.normalizer_g.normalize(g)
        if o_ is not None:
            o_ = self.normalizer_o.normalize(o_)
            return o, g, o_
        else:
            return o, g

    def select_action(self, observation, goal, train_mode=True):
        observation, goal = self.normalize_input(observation, goal)
        observation = torch.tensor(observation, dtype=torch.float32).to(self.device)
        goal = torch.tensor(goal, dtype=torch.float32).to(self.device)
        o_g = torch.cat([observation, goal], dim=0)
        with torch.no_grad():
            action = self.policy(o_g).cpu().numpy()
        if train_mode:
            # Gaussian exploration noise
            action += np.random.randn(self.a_dim) * self.noise_eps * self.action_bound
        action = np.clip(action, a_min=-self.action_bound, a_max=self.action_bound)
        return action

    def learn(self, memory):
        o, a, r, o_, g = memory.sample_batch(batch_size=self.batch_size)
        o, g, o_ = self.normalize_input(o, g, o_)
        o = torch.from_numpy(o).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        o_ = torch.from_numpy(o_).to(self.device)
        g = torch.from_numpy(g).to(self.device)

        # update Q
        a_next_target = self.policy_target(torch.cat([o_, g], dim=1))
        q_tar = r + self.gamma * self.Q_target(torch.cat([o_, a_next_target, g], dim=1))
        # clip the target to the range of returns achievable with sparse rewards in [-1, 0]
        q_tar = torch.clamp(q_tar, -1 / (1 - self.gamma), 0)
        q_pred = self.Q(torch.cat([o, a, g], dim=1))
        loss_q = F.mse_loss(q_pred, q_tar.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        sync_grads(self.Q)
        self.optimizer_q.step()

        # update policy, with an L2 penalty on the scaled actions
        a_eval = self.policy(torch.cat([o, g], dim=1))
        loss_p = -self.Q(torch.cat([o, a_eval, g], dim=1)).mean() + \
            self.l2_coefficient * (a_eval / self.action_bound).pow(2).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        sync_grads(self.policy)
        self.optimizer_p.step()

        return loss_q.cpu().item(), loss_p.cpu().item(), q_pred.mean().cpu().item()

    def update_normalizer(self, observations, goals):
        observations = np.array(observations, dtype=np.float32)
        goals = np.array(goals, dtype=np.float32)
        self.normalizer_o.update(observations)
        self.normalizer_g.update(goals)

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([self.normalizer_o.mean, self.normalizer_o.std,
                    self.normalizer_g.mean, self.normalizer_g.std,
                    self.policy.state_dict()], path)

    def load_model(self, remarks):
        print('Loading models with remark {}'.format(remarks))
        self.normalizer_o.mean, self.normalizer_o.std, self.normalizer_g.mean, \
            self.normalizer_g.std, policy_model = torch.load(
                'pretrained_models_DDPG/{}.pt'.format(remarks),
                map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
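# A short, self-contained check of the critic-target clipping used in
# DDPG_Her_Agent.learn: assuming sparse per-step rewards in [-1, 0], the
# discounted return can never fall below -1 / (1 - gamma), so clamping the Q
# target to [-1 / (1 - gamma), 0] does not discard any reachable value.
import numpy as np

gamma = 0.98
worst_case_return = sum(-1.0 * gamma ** t for t in range(10_000))  # every step fails
print(worst_case_return, -1.0 / (1.0 - gamma))  # both approximately -50.0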