def main():
    parser = argparse.ArgumentParser('Parse configuration file')
    parser.add_argument('--config', type=str, default='configs/model.config')
    parser.add_argument('--gpu', default=False, action='store_true')
    args = parser.parse_args()
    config_file = args.config
    model_config = configparser.RawConfigParser()
    model_config.read(config_file)
    env_config = configparser.RawConfigParser()
    env_config.read('configs/env.config')

    # configure paths
    output_dir = os.path.splitext(os.path.basename(args.config))[0]
    output_dir = os.path.join('data', output_dir)
    if os.path.exists(output_dir):
        # raise FileExistsError('Output folder already exists')
        print('Output folder already exists')
    else:
        os.mkdir(output_dir)
    log_file = os.path.join(output_dir, 'output.log')
    shutil.copy(args.config, output_dir)
    initialized_weights = os.path.join(output_dir, 'initialized_model.pth')
    trained_weights = os.path.join(output_dir, 'trained_model.pth')

    # configure logging
    file_handler = logging.FileHandler(log_file, mode='w')
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(level=logging.INFO, handlers=[stdout_handler, file_handler],
                        format='%(asctime)s, %(levelname)s: %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")

    # configure device
    device = torch.device("cuda:0" if torch.cuda.is_available() and args.gpu else "cpu")
    logging.info('Using device: {}'.format(device))

    # configure model
    state_dim = model_config.getint('model', 'state_dim')
    kinematic = env_config.getboolean('agent', 'kinematic')
    model = ValueNetwork(state_dim=state_dim, fc_layers=[100, 100, 100],
                         kinematic=kinematic).to(device)
    logging.debug('Trainable parameters: {}'.format(
        [name for name, p in model.named_parameters() if p.requires_grad]))

    # load simulated data from ORCA
    traj_dir = model_config.get('init', 'traj_dir')
    gamma = model_config.getfloat('model', 'gamma')
    capacity = model_config.getint('train', 'capacity')
    memory = initialize_memory(traj_dir, gamma, capacity, kinematic, device)

    # initialize model
    if os.path.exists(initialized_weights):
        model.load_state_dict(torch.load(initialized_weights))
        logging.info('Loaded initialized model weights')
    else:
        initialize_model(model, memory, model_config, device)
        torch.save(model.state_dict(), initialized_weights)
        logging.info('Finished initializing model. Model saved')

    # train the model
    train(model, memory, model_config, env_config, device, trained_weights)
    torch.save(model.state_dict(), trained_weights)
    logging.info('Finished training model. Model saved')
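# Illustrative only: a minimal sketch of the two config files that main() above expects.
# Section and option names come from the get*/getboolean calls in these snippets;
# every concrete value below is a placeholder assumption, not taken from the original project.
import configparser
import os

os.makedirs('configs', exist_ok=True)

model_config = configparser.RawConfigParser()
model_config.read_dict({
    'model': {'state_dim': '15', 'gamma': '0.9'},          # values are guesses
    'train': {'capacity': '100000'},
    'init': {'traj_dir': 'data/orca_trajectories'},
})
with open('configs/model.config', 'w') as f:
    model_config.write(f)

env_config = configparser.RawConfigParser()
env_config.read_dict({
    'agent': {'kinematic': 'true', 'radius': '0.3'},
    'sim': {'crossing_radius': '2.0', 'ymax': '5.0'},
    'visualization': {'xmin': '-5', 'xmax': '5', 'ymin': '-5', 'ymax': '5'},
})
with open('configs/env.config', 'w') as f:
    env_config.write(f)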
class Agent:
    def __init__(self, state_size, action_size, num_agents):
        state_dim = state_size
        # agent_input_state_dim = state_size * 2  # Previous state is passed in with the current state.
        action_dim = action_size
        self.num_agents = num_agents
        max_size = 100000
        self.replay = Replay(max_size)
        hidden_dim = 128
        self.critic_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_critic_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.actor_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_actor_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=CRITIC_LEARNING_RATE)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=ACTOR_LEARNING_RATE)

    def get_action(self, state):
        return self.actor_net.get_action(state)[0]

    def add_replay(self, state, action, reward, next_state, done):
        for i in range(self.num_agents):
            self.replay.add(state[i], action[i], reward[i], next_state[i], done[i])

    def learning_step(self):
        # Check if the replay buffer contains enough samples for one batch
        if self.replay.cursize < BATCH_SIZE:
            return

        # Get samples
        state, action, reward, next_state, done = self.replay.get(BATCH_SIZE)

        # Calculate losses
        actor_loss = self.critic_net(state, self.actor_net(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor_net(next_state)
        target_value = self.target_critic_net(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * DISCOUNT_RATE * target_value
        value = self.critic_net(state, action)
        critic_loss = F.mse_loss(value, expected_value.detach())

        # Backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft update
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

    def save(self, name):
        torch.save(self.critic_net.state_dict(), name + "_critic")
        torch.save(self.actor_net.state_dict(), name + "_actor")

    def load(self, name):
        self.critic_net.load_state_dict(torch.load(name + "_critic"))
        self.critic_net.eval()
        self.actor_net.load_state_dict(torch.load(name + "_actor"))
        self.actor_net.eval()
        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
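# A minimal, runnable illustration of the soft (Polyak) update used by Agent.soft_update above:
# θ_target = τ·θ_local + (1 − τ)·θ_target. The layer sizes and τ = 0.01 are arbitrary choices
# for this sketch, not values from the original project.
import torch
import torch.nn as nn

local = nn.Linear(4, 2)
target = nn.Linear(4, 2)
tau = 0.01

with torch.no_grad():
    for target_param, local_param in zip(target.parameters(), local.parameters()):
        # blend a small fraction of the local weights into the target weights
        target_param.copy_(tau * local_param + (1.0 - tau) * target_param)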
class DQNAgent(object):
    def __init__(self, env, args, work_dir):
        self.env = env
        self.args = args
        self.work_dir = work_dir
        self.n_action = self.env.action_space.n
        self.arr_actions = np.arange(self.n_action)
        self.memory = ReplayMemory(self.args.buffer_size, self.args.device)
        self.qNetwork = ValueNetwork(self.n_action, self.env).to(self.args.device)
        self.targetNetwork = ValueNetwork(self.n_action, self.env).to(self.args.device)
        self.qNetwork.train()
        self.targetNetwork.eval()
        self.optimizer = optim.RMSprop(self.qNetwork.parameters(),
                                       lr=0.00025, eps=0.001, alpha=0.95)
        self.crit = nn.MSELoss()
        self.eps = max(self.args.eps, self.args.eps_min)
        self.eps_delta = (self.eps - self.args.eps_min) / self.args.exploration_decay_speed

    def reset(self):
        return torch.cat([preprocess_state(self.env.reset(), self.env)] * 4, 1)

    def select_action(self, state):
        # epsilon-greedy: every action gets probability eps/n, the greedy action gets an extra (1 - eps)
        action_prob = np.zeros(self.n_action, np.float32)
        action_prob.fill(self.eps / self.n_action)
        max_q, max_q_index = self.qNetwork(Variable(state.to(self.args.device))).data.cpu().max(1)
        action_prob[max_q_index[0]] += 1 - self.eps
        action = np.random.choice(self.arr_actions, p=action_prob)
        next_state, reward, done, _ = self.env.step(action)
        # keep a rolling stack of the four most recent frames
        next_state = torch.cat([state.narrow(1, 1, 3), preprocess_state(next_state, self.env)], 1)
        self.memory.push((state, torch.LongTensor([int(action)]), torch.Tensor([reward]),
                          next_state, torch.Tensor([done])))
        return next_state, reward, done, max_q[0]

    def run(self):
        state = self.reset()
        # init buffer
        for _ in range(self.args.buffer_init_size):
            next_state, _, done, _ = self.select_action(state)
            state = self.reset() if done else next_state

        total_frame = 0
        reward_list = np.zeros(self.args.log_size, np.float32)
        qval_list = np.zeros(self.args.log_size, np.float32)
        start_time = time.time()
        for epi in count():
            reward_list[epi % self.args.log_size] = 0
            qval_list[epi % self.args.log_size] = -1e9
            state = self.reset()
            done = False
            ep_len = 0
            if epi % self.args.save_freq == 0:
                model_file = os.path.join(self.work_dir, 'model.th')
                with open(model_file, 'wb') as f:
                    torch.save(self.qNetwork, f)
            while not done:
                if total_frame % self.args.sync_period == 0:
                    self.targetNetwork.load_state_dict(self.qNetwork.state_dict())
                self.eps = max(self.args.eps_min, self.eps - self.eps_delta)
                next_state, reward, done, qval = self.select_action(state)
                reward_list[epi % self.args.log_size] += reward
                qval_list[epi % self.args.log_size] = max(qval_list[epi % self.args.log_size], qval)
                state = next_state
                total_frame += 1
                ep_len += 1
                if ep_len % self.args.learn_freq == 0:
                    batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
                        self.memory.sample(self.args.batch_size)
                    batch_q = self.qNetwork(batch_state).gather(
                        1, batch_action.unsqueeze(1)).squeeze(1)
                    batch_next_q = self.targetNetwork(batch_next_state).detach().max(1)[0] \
                        * self.args.gamma * (1 - batch_done)
                    loss = self.crit(batch_q, batch_reward + batch_next_q)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
            output_str = 'episode %d frame %d time %.2fs cur_rew %.3f mean_rew %.3f cur_maxq %.3f mean_maxq %.3f' % (
                epi, total_frame, time.time() - start_time,
                reward_list[epi % self.args.log_size], np.mean(reward_list),
                qval_list[epi % self.args.log_size], np.mean(qval_list))
            print(output_str)
            logging.info(output_str)
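# A small, runnable sketch of the epsilon-greedy scheme used in DQNAgent.select_action above:
# every action gets probability eps/n and the greedy (max-Q) action gets an extra (1 - eps).
# The Q-values and eps below are made-up numbers for illustration only.
import numpy as np

eps = 0.1
q_values = np.array([0.2, 1.5, -0.3, 0.7])   # dummy Q(s, a) for a 4-action problem
n_action = len(q_values)

action_prob = np.full(n_action, eps / n_action)
action_prob[q_values.argmax()] += 1 - eps    # probabilities sum to 1
action = np.random.choice(np.arange(n_action), p=action_prob)
print(action_prob, action)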
ymax = env_config.getfloat('sim', 'ymax')
xmin = env_config.getfloat('visualization', 'xmin')
xmax = env_config.getfloat('visualization', 'xmax')
ymin = env_config.getfloat('visualization', 'ymin')
ymax = env_config.getfloat('visualization', 'ymax')
crossing_radius = env_config.getfloat('sim', 'crossing_radius')
kinematic = env_config.getboolean('agent', 'kinematic')
radius = env_config.getfloat('agent', 'radius')
device = torch.device('cpu')
test_env = ENV(config=env_config, phase='test')
test_env.reset(case)
model = ValueNetwork(state_dim=state_dim, fc_layers=[100, 100, 100], kinematic=kinematic)
model.load_state_dict(torch.load(weight_path, map_location=lambda storage, loc: storage))
_, state_sequences, _, _ = run_one_episode(model, 'test', test_env, gamma, None,
                                           kinematic, device)

positions = list()
colors = list()
counter = list()
line_positions = list()
for i in range(len(state_sequences[0])):
    counter.append(i)
    if state_sequences[0][i] is None:
        p0 = positions[-4][0]
        c0 = 'tab:red'
        h0 = 0
    else:
        p0 = (state_sequences[0][i].px, state_sequences[0][i].py)
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs, self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha *
                               (log_pi + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            # A separate function approximator for the soft value can stabilize training.
            vf = self.value(state_batch)
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * vf_next_target
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_logs = self.alpha  # For TensorboardX logs
            with torch.no_grad():
                next_state_action, _, _, _, _ = self.policy.sample(next_state_batch)
                # Use a target critic network for the deterministic policy and
                # remove the value network completely.
                qf1_next_target, qf2_next_target = self.critic_target(next_state_batch,
                                                                      next_state_action)
                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            vf_target = min_qf_pi - (self.alpha * log_pi)
            value_loss = F.mse_loss(vf, vf_target.detach())

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # Regularization Loss
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()
        # policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.).to(self.device)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically, i.e. after every n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)

        return value_loss.item(), qf1_loss.item(), qf2_loss.item(), \
            policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
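# A runnable toy illustration (dummy numbers, batch of 3) of the two targets formed in
# SAC.update_parameters above:
#   Q target:      r + mask * gamma * V_target(s')
#   value target:  min(Q1(s, a~π), Q2(s, a~π)) - alpha * log π(a|s)
# gamma, alpha, and all tensor values are made up for this sketch.
import torch

gamma, alpha = 0.99, 0.2
reward = torch.tensor([[1.0], [0.0], [0.5]])
mask = torch.tensor([[1.0], [1.0], [0.0]])        # 0 marks a terminal transition
vf_next_target = torch.tensor([[2.0], [1.5], [3.0]])

next_q_value = reward + mask * gamma * vf_next_target

qf1_pi = torch.tensor([[1.2], [0.8], [2.0]])
qf2_pi = torch.tensor([[1.0], [0.9], [2.5]])
log_pi = torch.tensor([[-0.5], [-1.0], [-0.2]])

vf_target = torch.min(qf1_pi, qf2_pi) - alpha * log_pi
print(next_q_value.squeeze(1), vf_target.squeeze(1))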
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs, self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.sample(state)
            if self.policy_type == "Gaussian":
                action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1)
        mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1)

        """
        Use two Q-functions to mitigate positive bias in the policy improvement step
        that is known to degrade the performance of value-based methods. Two Q-functions
        also significantly speed up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                """
                Alpha Loss
                """
                alpha_loss = -(self.log_alpha *
                               (log_prob + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.)
                alpha_logs = self.alpha  # For TensorboardX logs

            """
            Including a separate function approximator for the soft value can
            stabilize training.
            """
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * target_value.detach()
        else:
            """
            There is no need in principle to include a separate function approximator
            for the state value. We use a target critic network for the deterministic
            policy and remove the value network completely.
            """
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _ = self.policy.sample(next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(next_state_batch,
                                                                  next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * target_critic.detach()

        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)

        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize
            training and is convenient to train simultaneously with the other networks.
            Update V towards the min of the two Q-functions in order to reduce
            overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())

        """
        The reparameterization trick is used to get a low-variance estimator.
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()
        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically, i.e. after every n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)

        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(), \
            policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class DDPG:
    def __init__(self, cfg):
        self.device = cfg.device
        self.gamma = cfg.gamma
        self.batch_size = cfg.batch_size

        self.value_net = ValueNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)

        self.target_value_net = ValueNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())

        self.soft_tau = cfg.soft_tau
        self.value_lr = cfg.value_lr
        self.policy_lr = cfg.policy_lr
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
        # mean squared error
        self.value_criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(cfg.replay_buffer_size)

    def update(self, cfg):
        state, action, reward, next_state, done = self.replay_buffer.sample(cfg.batch_size)
        # print(np.shape(state), np.shape(action), np.shape(reward), np.shape(next_state), np.shape(done))
        # (128, 3) (128, 1) (128,) (128, 3) (128,)
        state = torch.FloatTensor(state).to(cfg.device)
        action = torch.FloatTensor(action).to(cfg.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        done = torch.FloatTensor(done).unsqueeze(1).to(cfg.device)

        # Actor Loss
        policy_loss = self.value_net(state, self.policy_net(state))
        policy_loss = -policy_loss.mean()

        next_action = self.target_policy_net(next_state)
        target_value = self.target_value_net(next_state, next_action.detach())
        TD_target = reward + (1.0 - done) * self.gamma * target_value
        TD_target = torch.clamp(TD_target, -np.inf, np.inf)
        value = self.value_net(state, action)
        # Critic Loss
        value_loss = self.value_criterion(value, TD_target.detach())

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update target networks (Polyak averaging)
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
class SAC:
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(n_states=self.n_states,
                                            n_actions=self.n_actions,
                                            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(self.device)
        self.value_target_network.load_state_dict(self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(), lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(), lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size, self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size, 1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1, self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(self.batch_size,
                                                       self.n_states).to(self.device)

        return states, rewards, dones, actions, next_states

    def train(self):
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-value target
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                           self.gamma * self.value_target_network(next_states) * (1 - dones)
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            policy_loss = (self.alpha * log_probs - q).mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            self.soft_update_target_network(self.value_network, self.value_target_network)

            return value_loss.item(), 0.5 * (q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.policy_network.state_dict(), self.env_name + "_weights.pth")

    def load_weights(self):
        self.policy_network.load_state_dict(torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        self.policy_network.eval()
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.scale_R = args.scale_R
        self.reparam = args.reparam
        self.deterministic = args.deterministic
        self.target_update_interval = args.target_update_interval

        self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if not self.deterministic:
            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
            self.value_criterion = nn.MSELoss()
        else:
            self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
            hard_update(self.critic_target, self.critic)

        self.soft_q_criterion = nn.MSELoss()

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if not eval:
            self.policy.train()
            _, _, action, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.evaluate(state)
        action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch)
        mask_batch = torch.FloatTensor(np.float32(mask_batch))

        reward_batch = reward_batch.unsqueeze(1)  # reward_batch = [batch_size, 1]
        mask_batch = mask_batch.unsqueeze(1)      # mask_batch = [batch_size, 1]

        """
        Use two Q-functions to mitigate positive bias in the policy improvement step
        that is known to degrade the performance of value-based methods. Two Q-functions
        also significantly speed up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(state_batch, action_batch)
        new_action, log_prob, x_t, mean, log_std = self.policy.evaluate(state_batch,
                                                                        reparam=self.reparam)

        """
        Including a separate function approximator for the soft value can stabilize training.
        """
        expected_value = self.value(state_batch)
        target_value = self.value_target(next_state_batch)
        # Reward Scale * r(st,at) + γV(target)(st+1)
        next_q_value = self.scale_R * reward_batch + mask_batch * self.gamma * target_value

        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = self.soft_q_criterion(expected_q1_value, next_q_value.detach())
        q2_value_loss = self.soft_q_criterion(expected_q2_value, next_q_value.detach())

        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        """
        Including a separate function approximator for the soft value can stabilize
        training and is convenient to train simultaneously with the other networks.
        Update V towards the min of the two Q-functions in order to reduce
        overestimation bias from function approximation error.
        JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - log π(at|st)]))^2]
        ∇JV = ∇V(st)(V(st) - Q(st,at) + logπ(at|st))
        """
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value, next_value.detach())

        log_prob_target = expected_new_q_value - expected_value

        if self.reparam:
            """
            The reparameterization trick is used to get a low-variance estimator.
            f(εt;st) = action sampled from the policy
            εt is an input noise vector, sampled from some fixed distribution
            Jπ = 𝔼st∼D,εt∼N[logπ(f(εt;st)|st) − Q(st,f(εt;st))]
            ∇Jπ = ∇log π + ([∇at log π(at|st) − ∇at Q(st,at)])∇f(εt;st)
            """
            policy_loss = (log_prob - expected_new_q_value).mean()
        else:
            # likelihood ratio gradient estimator
            policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()
        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if not self.deterministic:
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically, i.e. after every n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.deterministic:
            soft_update(self.critic_target, self.critic, self.tau)
            return 0, q1_value_loss.item(), q2_value_loss.item(), policy_loss.item()
        elif updates % self.target_update_interval == 0 and not self.deterministic:
            soft_update(self.value_target, self.value, self.tau)
            return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(), \
                policy_loss.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
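# A toy, runnable contrast of the two policy-gradient estimators used in update_parameters
# above: the reparameterized loss (log_prob - Q).mean() and the likelihood-ratio surrogate
# (log_prob * (log_prob - log_prob_target).detach()).mean(). The 1-D Gaussian policy and the
# quadratic stand-in "Q-function" are made up for this sketch, and V(s) is assumed to be 0,
# so log_prob_target reduces to Q alone.
import torch

mean = torch.zeros(1, requires_grad=True)
log_std = torch.zeros(1, requires_grad=True)

def q_function(a):
    # stand-in critic: prefers actions near 1.0
    return -(a - 1.0) ** 2

dist = torch.distributions.Normal(mean, log_std.exp())

# Reparameterized estimator: the gradient flows through the sampled action
action = dist.rsample()
log_prob = dist.log_prob(action)
reparam_loss = (log_prob - q_function(action)).mean()

# Likelihood-ratio estimator: only log_prob carries gradient
action_lr = dist.sample()
log_prob_lr = dist.log_prob(action_lr)
log_prob_target = q_function(action_lr)          # Q(s,a) - V(s), with V(s) = 0 in this toy
lr_loss = (log_prob_lr * (log_prob_lr - log_prob_target).detach()).mean()

print(reparam_loss.item(), lr_loss.item())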