def __init__(self, args, id): self.args = args self.id = id #### Rollout Actor is a template used for MP ##### self.manager = Manager() self.rollout_actor = self.manager.list() for _ in range(args.config.num_agents): if args.ps == 'trunk': self.rollout_actor.append( MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)) else: if args.algo_name == 'TD3': self.rollout_actor.append( Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='DeterministicPolicy')) else: self.rollout_actor.append( Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='GaussianPolicy')) if self.args.ps == 'full' or self.args.ps == 'trunk': break #Only need one for homogeneous workloads
def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True): self.algo_name = algo_name; self.gamma = gamma; self.tau = tau #Initialize actors self.actor = Actor(state_dim, action_dim, wwid) if init_w: self.actor.apply(utils.init_weights) self.actor_target = Actor(state_dim, action_dim, wwid) utils.hard_update(self.actor_target, self.actor) self.actor_optim = Adam(self.actor.parameters(), actor_lr) self.critic = Critic(state_dim, action_dim) if init_w: self.critic.apply(utils.init_weights) self.critic_target = Critic(state_dim, action_dim) utils.hard_update(self.critic_target, self.critic) self.critic_optim = Adam(self.critic.parameters(), critic_lr) self.loss = nn.MSELoss() self.actor_target.cuda(); self.critic_target.cuda(); self.actor.cuda(); self.critic.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]} self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]} self.critic_loss = {'mean':[]} self.q = {'min':[], 'max': [], 'mean':[], 'std':[]} self.val = {'min':[], 'max': [], 'mean':[], 'std':[]}
def __init__(self, args): self.args = args self.actor = Actor(args) self.actor.apply(utils.init_weights) self.actor_target = Actor(args) self.actor_optim = Adam(self.actor.parameters(), lr=1e-4) self.critic = Critic(args) self.critic.apply(utils.init_weights) self.critic_target = Critic(args) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3) self.gamma = args.gamma self.tau = self.args.tau self.loss = nn.MSELoss() self.hard_update( self.actor_target, self.actor) # Make sure target is with the same weight self.hard_update(self.critic_target, self.critic) self.actor_target.cuda() self.critic_target.cuda() self.actor.cuda() self.critic.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.critic_loss = {'mean': []} self.q = {'min': [], 'max': [], 'mean': [], 'std': []} self.val = {'min': [], 'max': [], 'mean': [], 'std': []}
def __init__(self, state_dim, action_dim, gamma, tau, buffer_size, is_mem_cuda, out_act): self.actor = Actor(state_dim, action_dim, is_evo=False, out_act=out_act) self.actor_target = Actor(state_dim, action_dim, is_evo=False, out_act=out_act) self.actor_optim = Adam(self.actor.parameters(), lr=1e-4) self.critic = Critic(state_dim, action_dim) self.critic_target = Critic(state_dim, action_dim) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3) self.gamma = gamma self.tau = tau self.loss = nn.MSELoss() self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda) self.exploration_noise = OUNoise(action_dim) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic)
class A2C(object): def __init__(self, args): self.args = args self.actor = Actor(args) self.actor_target = Actor(args) self.actor_optim = Adam(self.actor.parameters(), lr=1e-4) self.critic = Critic(args) self.critic_target = Critic(args) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3) self.gamma = args.gamma self.tau = self.args.tau self.loss = nn.MSELoss() hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) def update_parameters(self, batch): state_batch = torch.cat(batch.state) next_state_batch = torch.cat(batch.next_state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) done_batch = torch.cat(batch.done) state_batch.volatile = False next_state_batch.volatile = True action_batch.volatile = False # Critic Update vals = self.critic.forward(state_batch) new_vals = self.critic.forward(next_state_batch) * (1 - done_batch) targets = reward_batch + self.gamma * new_vals self.critic_optim.zero_grad() dt = self.loss(vals, targets) dt.backward() self.critic_optim.step() # Actor Update self.actor_optim.zero_grad() state_batch = utils.to_tensor(utils.to_numpy(state_batch)) targets = utils.to_tensor(utils.to_numpy(targets)) vals = utils.to_tensor(utils.to_numpy(vals)) action_logs = self.actor.forward(state_batch) entropy_loss = torch.mean(entropy(torch.exp(action_logs))) action_logs = F.log_softmax(action_logs) dt = targets - vals alogs = [] for i, action in enumerate(action_batch): action_i = int(action.cpu().data.numpy()) alogs.append(action_logs[i, action_i]) alogs = torch.cat(alogs).unsqueeze(0) policy_loss = -torch.mean(dt * alogs.t()) actor_loss = policy_loss - entropy_loss actor_loss.backward() self.actor_optim.step()
def __init__( self, CERL_agent, num_workers, trainers, pomdp_adv=False ): #trainers first is the blue agent and second is the red model self.num_workers = num_workers self.trainers = trainers self.pomdp_adv = pomdp_adv self.args = CERL_agent.args self.drqn = CERL_agent.args.drqn #denote if blue uses drqn if self.pomdp_adv: self.trainers = [trainers[0], None] #make sure the red model is never used self.buffer_gpu = CERL_agent.args.buffer_gpu self.batch_size = CERL_agent.args.batch_size self.algo = CERL_agent.args.algo self.state_dim = CERL_agent.args.state_dim self.action_dim = CERL_agent.args.action_dim self.buffer = Buffer(BUFFER_SIZE, self.buffer_gpu) #initialize own replay buffer self.data_bucket = self.buffer.tuples self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)] self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)] self.actual_red_worker = Actor( CERL_agent.args.state_dim, CERL_agent.args.action_dim, -1, 'dis') #this model is shared accross the workers self.actual_red_worker.share_memory() self.td3args = { 'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': CERL_agent.args.action_low, 'action_high': CERL_agent.args.action_high, 'cerl_args': self.args } self.renew_learner( ) #now we are not using new learner for each iteration self.rollout_bucket = [ self.actual_red_worker for i in range(num_workers) ] self.workers = [ Process(target=rollout_worker, args=(id, 3, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.rollout_bucket, 'dummy_name', None, 'dis', self.trainers, False, self.pomdp_adv)) for id in range(num_workers) ] for worker in self.workers: worker.start() self.evo_flag = [True for _ in range(self.num_workers)]
def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha, target_update_interval, savetag, foldername, actualize, use_gpu): self.num_inputs = num_inputs self.action_space = action_dim self.gamma = gamma self.tau = 0.005 self.alpha = 0.2 self.policy_type = "Gaussian" self.target_update_interval = 1 self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'value_'+savetag, 'value_loss_'+savetag, 'policy_loss_'+savetag, 'mean_loss_'+savetag, 'std_loss_'+savetag], '.csv',save_iteration=1000, conv_size=1000) self.total_update = 0 self.agent_id = id self.actualize = actualize self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size) self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr) self.soft_q_criterion = nn.MSELoss() if self.policy_type == "Gaussian": self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy') self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr) self.value = ValueNetwork(self.num_inputs, hidden_size) self.value_target = ValueNetwork(self.num_inputs, hidden_size) self.value_optim = Adam(self.value.parameters(), lr=critic_lr) utils.hard_update(self.value_target, self.value) self.value_criterion = nn.MSELoss() else: self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy') self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr) self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size) utils.hard_update(self.critic_target, self.critic) self.policy.cuda() self.value.cuda() self.value_target.cuda() self.critic.cuda() #Statistics Tracker self.q = {'min':None, 'max': None, 'mean':None, 'std':None} self.val = {'min':None, 'max': None, 'mean':None, 'std':None} self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.mean_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.std_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None}
def __init__(self, args): self.args = args self.evolver = SSNE(self.args) #MP TOOLS self.manager = Manager() #Init population self.pop = self.manager.list() for _ in range(args.pop_size): self.pop.append(Actor(args)) #self.pop[-1].apply(utils.init_weights) self.best_policy = Actor(args) #Turn off gradients and put in eval mode for actor in self.pop: actor = actor.cpu() actor.eval() if SEED_POP: self.load_seed(args.model_save, self.pop) #Init BUFFER self.replay_buffer = Buffer(100000, self.args.data_folder) #MP TOOLS self.exp_list = self.manager.list() self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_workers = [ Process(target=rollout_worker, args=(i, self.evo_task_pipes[i][1], self.evo_result_pipes[i][1], None, self.exp_list, self.pop, DIFFICULTY, USE_RS, True, USE_SYNTHETIC_TARGET, XBIAS, ZBIAS, PHASE_LEN, None, EP_LEN, JGS)) for i in range(args.pop_size) ] for worker in self.evo_workers: worker.start() #Trackers self.buffer_added = 0 self.best_score = 0.0 self.frames_seen = 0.0 self.best_shaped_score = None self.eval_flag = [True for _ in range(args.pop_size)]
def create_actor(name="ModelName", height=185, sex=1): actor = Actor() actor.name = name actor.height = height actor.sex = sex actor.save() return actor
def __init__(self, args): self.args = args self.actor = Actor(args, init=True) self.actor_target = Actor(args, init=True) self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4) self.critic = Critic(args) self.critic_target = Critic(args) self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3) self.gamma = args.gamma self.tau = self.args.tau self.loss = nn.MSELoss() hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic)
def __init__(self, sample_budget): self.sample_budget = sample_budget dummy_args = Parameters() #Load all Critics critic_template = Critic(dummy_args) self.critic_ensemble = utils.load_all_models_dir( CRITIC_DIR, critic_template) #Load all Actors actor_template = Actor(dummy_args) self.actor_ensemble = utils.load_all_models_dir( ACTOR_DIR, actor_template)
def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, init_w = True): self.algo_name = algo_name; self.gamma = gamma; self.tau = tau; self.total_update = 0; self.agent_id = id; self.actualize = actualize; self.use_gpu = use_gpu self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag,'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000) #Initialize actors self.policy = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy') if init_w: self.policy.apply(utils.init_weights) self.policy_target = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy') utils.hard_update(self.policy_target, self.policy) self.policy_optim = Adam(self.policy.parameters(), actor_lr) self.critic = QNetwork(state_dim, action_dim,hidden_size) if init_w: self.critic.apply(utils.init_weights) self.critic_target = QNetwork(state_dim, action_dim, hidden_size) utils.hard_update(self.critic_target, self.critic) self.critic_optim = Adam(self.critic.parameters(), critic_lr) if actualize: self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size) if init_w: self.ANetwork.apply(utils.init_weights) self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr) self.actualize_lr = 0.2 if use_gpu: self.ANetwork.cuda() self.loss = nn.MSELoss() if use_gpu: self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda() self.num_critic_updates = 0 #Statistics Tracker #self.action_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.q = {'min':None, 'max': None, 'mean':None, 'std':None} self.alz_score = {'min':None, 'max': None, 'mean':None, 'std':None} self.alz_policy = {'min':None, 'max': None, 'mean':None, 'std':None}
def __init__(self, args): self.args = args self.actor = Actor(args) if args.init_w: self.actor.apply(utils.init_weights) self.actor_target = Actor(args) self.optim = Adam(self.actor.parameters(), lr=5e-4) self.vfunc = ValueFunc(args) if args.init_w: self.vfunc.apply(utils.init_weights) self.gamma = args.gamma self.loss = nn.SmoothL1Loss() #nn.MSELoss() #self.actor.cuda(); self.vfunc.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.critic_loss = {'mean': []} self.q = {'min': [], 'max': [], 'mean': [], 'std': []} self.val = {'min': [], 'max': [], 'mean': [], 'std': []}
def __init__(self, n_agents, dim_obs, dim_act, batch_size, capacity, episodes_before_train): self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)] self.critics = [Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)] self.actors_target = deepcopy(self.actors) self.critics_target = deepcopy(self.critics) self.n_agents = n_agents self.n_states = dim_obs self.n_actions = dim_act self.memory = ReplayMemory(capacity) self.batch_size = batch_size self.use_cuda = th.cuda.is_available() self.episodes_before_train = episodes_before_train self.GAMMA = 0.5 self.tau = 0.0001 self.var = [1.0 for i in range(n_agents)] self.critic_optimizer = [Adam(x.parameters(), lr=0.00005) for x in self.critics] self.actor_optimizer = [Adam(x.parameters(), lr=0.00005) for x in self.actors] if self.use_cuda: for x in self.actors: x.cuda() for x in self.critics: x.cuda() for x in self.actors_target: x.cuda() for x in self.critics_target: x.cuda() self.steps_done = 0 self.episode_done = 0
class Off_Policy_Algo(object): """Classes implementing TD3 and DDPG off-policy learners Parameters: args (object): Parameter class """ def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True): self.algo_name = algo_name self.gamma = gamma self.tau = tau self.HLoss = HLoss() #Initialize actors self.actor = Actor(state_dim, action_dim, wwid, self.algo_name) if init_w: self.actor.apply(utils.init_weights) self.actor_target = Actor(state_dim, action_dim, wwid, self.algo_name) utils.hard_update(self.actor_target, self.actor) self.actor_optim = Adam(self.actor.parameters(), actor_lr) self.critic = Critic(state_dim, action_dim) if init_w: self.critic.apply(utils.init_weights) self.critic_target = Critic(state_dim, action_dim) utils.hard_update(self.critic_target, self.critic) self.critic_optim = Adam(self.critic.parameters(), critic_lr) self.loss = nn.MSELoss() if torch.cuda.is_available(): self.actor_target.cuda() self.critic_target.cuda() self.actor.cuda() self.critic.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.critic_loss = {'mean': []} self.q = {'min': [], 'max': [], 'mean': [], 'std': []} self.val = {'min': [], 'max': [], 'mean': [], 'std': []} def save_net(self, path): torch.save(self.actor.state_dict(), path) def act(self, state): return self.actor(state) def share_memory(self): self.actor.share_memory() self.actor_target.share_memory() self.critic.share_memory() self.critic_target.share_memory() def compute_stats(self, tensor, tracker): """Computes stats from intermediate tensors Parameters: tensor (tensor): tensor tracker (object): logger Returns: None """ tracker['min'].append(torch.min(tensor).item()) tracker['max'].append(torch.max(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs): """Runs a step of Bellman upodate and policy gradient using a batch of experiences Parameters: state_batch (tensor): Current States next_state_batch (tensor): Next States action_batch (tensor): Actions reward_batch (tensor): Rewards done_batch (tensor): Done batch num_epoch (int): Number of learning iteration to run with the same data Returns: None """ if isinstance(state_batch, list): state_batch = torch.cat(state_batch) next_state_batch = torch.cat(next_state_batch) action_batch = torch.cat(action_batch) reward_batch = torch.cat(reward_batch).done_batch = torch.cat( done_batch) for _ in range(num_epoch): ########### CRITIC UPDATE #################### #Compute next q-val, next_v and target with torch.no_grad(): #Policy Noise policy_noise = np.random.normal( 0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1])) policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip']) #Compute next action_bacth #next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.Gumbel_softmax_sample_distribution(next_state_batch, use_cuda=True))\ # if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda() #this should use one-hot from logits next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.forward(next_state_batch)) \ if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + 
policy_noise.cuda() # this should use one-hot from logits if random.random() < 0.0001: print('off_policy line 114, changed next action batch') next_action_batch = torch.clamp(next_action_batch, 0, 1) #Compute Q-val and value of next state masking by done q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch) q1 = (1 - done_batch) * q1 q2 = (1 - done_batch) * q2 #Select which q to use as next-q (depends on algo) if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min' or self.algo_name == 'dis': next_q = torch.min(q1, q2) elif self.algo_name == 'DDPG': next_q = q1 elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2) #Compute target q and target val target_q = reward_batch + (self.gamma * next_q) self.critic_optim.zero_grad() current_q1, current_q2, current_val = self.critic.forward( (state_batch), (action_batch )) #here the action batch should be the soft version self.compute_stats(current_q1, self.q) dt = self.loss(current_q1, target_q) if self.algo_name == 'TD3' or self.algo_name == 'TD3_max' or self.algo_name == 'dis': dt = dt + self.loss(current_q2, target_q) self.critic_loss['mean'].append(dt.item()) #print(dt.item(), "off_policy_algo line 136") dt.backward() self.critic_optim.step() self.num_critic_updates += 1 #Delayed Actor Update if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: actor_actions = self.actor.Gumbel_softmax_sample_distribution(state_batch, use_cuda=True)\ if self.algo_name == 'dis' else self.actor.forward(state_batch) #actor_actions = self.actor.forward(state_batch) #if random.random() < 0.001: print('actor action changed') Q1, Q2, val = self.critic.forward(state_batch, actor_actions) # if self.args.use_advantage: policy_loss = -(Q1 - val) policy_loss = -Q1 + 0.1 * self.HLoss( actor_actions ) # HLoss is a single scalar, directly regularized logits? if random.random() < 0.0005: print('added entropy regularization, off_policy_algo 161') self.compute_stats(policy_loss, self.policy_loss) policy_loss = policy_loss.mean() #print(policy_loss, 'off_policy line 157') self.actor_optim.zero_grad() policy_loss.backward(retain_graph=True) self.actor_optim.step() #if random.random() <= 0.001: # self.test_actor_gradient_descent(state_batch) if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.actor_target, self.actor, self.tau) utils.soft_update(self.critic_target, self.critic, self.tau) def test_actor_gradient_descent(self, state_batch): #this method test if running gradient descent on the actor actually decrease the loss print("test_actor_gradient_descent, off_policy_algo line 179") for i in range(10): actor_actions = self.actor.forward(state_batch) print("logits_", self.actor.w_out(self.actor.logits(state_batch))[0]) print("action_batch", actor_actions[0]) Q1, Q2, val = self.critic.forward(state_batch, actor_actions) policy_loss = -Q1 policy_loss = policy_loss.mean() print("policy_loss at i = ", i, " is ", policy_loss) self.actor_optim.zero_grad() policy_loss.backward(retain_graph=True) print("gradient_", self.actor.f1.bias.grad[0]) self.actor_optim.step() print("bias_", self.actor.f1.bias[0])
class PPO(object): """Classes implementing TD3 and DDPG off-policy learners Parameters: args (object): Parameter class """ def __init__(self, args): self.args = args self.actor = Actor(args) if args.init_w: self.actor.apply(utils.init_weights) self.actor_target = Actor(args) self.optim = Adam(self.actor.parameters(), lr=5e-4) self.vfunc = ValueFunc(args) if args.init_w: self.vfunc.apply(utils.init_weights) self.gamma = args.gamma self.loss = nn.SmoothL1Loss() #nn.MSELoss() #self.actor.cuda(); self.vfunc.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.critic_loss = {'mean': []} self.q = {'min': [], 'max': [], 'mean': [], 'std': []} self.val = {'min': [], 'max': [], 'mean': [], 'std': []} def compute_gae(self, trajectory, gamma=0.99, tau=0.95): with torch.no_grad(): values = [] next_values = [] rewards = [] masks = [] states = [] actions = [] for entry in trajectory: states.append(torch.tensor(entry[0])) actions.append(torch.tensor(entry[1])) values.append(self.vfunc(torch.Tensor(entry[0]))) rewards.append(torch.Tensor(entry[3])) masks.append(torch.Tensor(entry[5])) values.append(self.vfunc(torch.Tensor(entry[2]))) gae = 0.0 returns = [] for step in reversed(range(len(rewards))): delta = rewards[step] + gamma * values[ step + 1] * masks[step] - values[step] gae = delta + gamma * tau * masks[step] * gae returns.insert(0, gae + values[step]) return states, actions, values, returns def compute_stats(self, tensor, tracker): """Computes stats from intermediate tensors Parameters: tensor (tensor): tensor tracker (object): logger Returns: None """ tracker['min'].append(torch.min(tensor).item()) tracker['max'].append(torch.max(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) def update_parameters(self, states, actions, log_probs, returns, advantages, ppo_epochs=8, mini_batch_size=128, clip_param=0.2): """Runs a step of Bellman upodate and policy gradient using a batch of experiences Parameters: state_batch (tensor): Current States next_state_batch (tensor): Next States action_batch (tensor): Actions reward_batch (tensor): Rewards done_batch (tensor): Done batch num_epoch (int): Number of learning iteration to run with the same data Returns: None """ for _ in range(ppo_epochs): ind = random.sample(range(len(states)), mini_batch_size) mini_s = states[ind] mini_a = actions[ind] mini_ret = returns[ind] mini_adv = advantages[ind] #PPO Update new_action, value = self.actor(mini_s), self.vfunc(mini_s) ratio = mini_a - new_action surr1 = ratio * mini_adv surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * mini_adv actor_loss = -torch.min(surr1, surr2).mean() critic_loss = (mini_ret - value).pow(2).mean() loss = 0.5 * critic_loss + actor_loss self.optim.zero_grad() loss.backward() self.optim.step() def soft_update(self, target, source, tau): """Soft update from target network to source Parameters: target (object): A pytorch model source (object): A pytorch model tau (float): Tau parameter Returns: None """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) def hard_update(self, target, source): """Hard update (clone) from target network to source Parameters: target (object): A pytorch model source (object): A pytorch model Returns: None """ for target_param, param in zip(target.parameters(), 
source.parameters()): target_param.data.copy_(param.data)
class TD3_DDPG(object): """Classes implementing TD3 and DDPG off-policy learners Parameters: args (object): Parameter class """ def __init__(self, args): self.args = args self.algo = args.algo self.actor = Actor(args) if args.init_w: self.actor.apply(utils.init_weights) self.actor_target = Actor(args) self.actor_optim = Adam(self.actor.parameters(), lr=5e-5) self.critic = Critic(args) if args.init_w: self.critic.apply(utils.init_weights) self.critic_target = Critic(args) self.critic_optim = Adam(self.critic.parameters(), lr=5e-4) self.gamma = args.gamma self.tau = self.args.tau self.loss = nn.MSELoss() self.hard_update( self.actor_target, self.actor) # Make sure target is with the same weight self.hard_update(self.critic_target, self.critic) self.actor_target.cuda() self.critic_target.cuda() self.actor.cuda() self.critic.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []} self.critic_loss = {'mean': []} self.q = {'min': [], 'max': [], 'mean': [], 'std': []} self.val = {'min': [], 'max': [], 'mean': [], 'std': []} def compute_stats(self, tensor, tracker): """Computes stats from intermediate tensors Parameters: tensor (tensor): tensor tracker (object): logger Returns: None """ tracker['min'].append(torch.min(tensor).item()) tracker['max'].append(torch.max(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1): """Runs a step of Bellman upodate and policy gradient using a batch of experiences Parameters: state_batch (tensor): Current States next_state_batch (tensor): Next States action_batch (tensor): Actions reward_batch (tensor): Rewards done_batch (tensor): Done batch num_epoch (int): Number of learning iteration to run with the same data Returns: None """ if isinstance(state_batch, list): state_batch = torch.cat(state_batch) next_state_batch = torch.cat(next_state_batch) action_batch = torch.cat(action_batch) reward_batch = torch.cat(reward_batch).done_batch = torch.cat( done_batch) for _ in range(num_epoch): ########### CRITIC UPDATE #################### #Compute next q-val, next_v and target with torch.no_grad(): #Policy Noise policy_noise = np.random.normal( 0, self.args.policy_noise, (action_batch.size()[0], action_batch.size()[1])) policy_noise = torch.clamp(torch.Tensor(policy_noise), -self.args.policy_noise_clip, self.args.policy_noise_clip) #Compute next action_bacth next_action_batch = self.actor_target.forward( next_state_batch) + policy_noise.cuda() next_action_batch = torch.clamp(next_action_batch, 0, 1) #Compute Q-val and value of next state masking by done q1, q2, next_val = self.critic_target.forward( next_state_batch, next_action_batch) if self.args.use_done_mask: q1 = (1 - done_batch) * q1 q2 = (1 - done_batch) * q2 next_val = (1 - done_batch) * next_val #Clamp Q-vals if self.args.q_clamp != None: q1 = torch.clamp(q1, -self.args.q_clamp, self.args.q_clamp) q1 = torch.clamp(q2, -self.args.q_clamp, self.args.q_clamp) #Select which q to use as next-q (depends on algo) if self.algo == 'TD3' or self.algo == 'TD3_actor_min': next_q = torch.min(q1, q2) elif self.algo == 'DDPG': next_q = q1 elif self.algo == 'TD3_max': next_q = torch.max(q1, q2) #Compute target q and target val target_q = reward_batch + (self.gamma * next_q) if self.args.use_advantage: target_val = reward_batch + 
(self.gamma * next_val) self.critic_optim.zero_grad() current_q1, current_q2, current_val = self.critic.forward( (state_batch), (action_batch)) self.compute_stats(current_q1, self.q) dt = self.loss(current_q1, target_q) if self.args.use_advantage: dt = dt + self.loss(current_val, target_val) self.compute_stats(current_val, self.val) if self.algo == 'TD3' or self.algo == 'TD3_max': dt = dt + self.loss(current_q2, target_q) self.critic_loss['mean'].append(dt.item()) if self.args.critic_constraint: if dt.item() > self.args.critic_constraint_w: dt = dt * (abs(self.args.critic_constraint_w / dt.item())) dt.backward() self.critic_optim.step() self.num_critic_updates += 1 #Delayed Actor Update if self.num_critic_updates % self.args.policy_ups_freq == 0: actor_actions = self.actor.forward(state_batch) # Trust Region constraint if self.args.trust_region_actor: with torch.no_grad(): old_actor_actions = self.actor_target.forward( state_batch) actor_actions = action_batch - old_actor_actions Q1, Q2, val = self.critic.forward(state_batch, actor_actions) if self.args.use_advantage: policy_loss = -(Q1 - val) else: policy_loss = -Q1 self.compute_stats(policy_loss, self.policy_loss) policy_loss = policy_loss.mean() self.actor_optim.zero_grad() policy_loss.backward(retain_graph=True) #nn.utils.clip_grad_norm_(self.actor.parameters(), 10) if self.args.action_loss: action_loss = torch.abs(actor_actions - 0.5) self.compute_stats(action_loss, self.action_loss) action_loss = action_loss.mean() * self.args.action_loss_w action_loss.backward() #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss self.actor_optim.step() if self.args.hard_update: if self.num_critic_updates % self.args.hard_update_freq == 0: if self.num_critic_updates % self.args.policy_ups_freq == 0: self.hard_update(self.actor_target, self.actor) self.hard_update(self.critic_target, self.critic) else: if self.num_critic_updates % self.args.policy_ups_freq == 0: self.soft_update(self.actor_target, self.actor, self.tau) self.soft_update(self.critic_target, self.critic, self.tau) def soft_update(self, target, source, tau): """Soft update from target network to source Parameters: target (object): A pytorch model source (object): A pytorch model tau (float): Tau parameter Returns: None """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) def hard_update(self, target, source): """Hard update (clone) from target network to source Parameters: target (object): A pytorch model source (object): A pytorch model Returns: None """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
class Actor_Critic(object): def __init__(self, state_dim, action_dim, gamma, tau, buffer_size, is_mem_cuda, out_act): self.actor = Actor(state_dim, action_dim, is_evo=False, out_act=out_act) self.actor_target = Actor(state_dim, action_dim, is_evo=False, out_act=out_act) self.actor_optim = Adam(self.actor.parameters(), lr=1e-4) self.critic = Critic(state_dim, action_dim) self.critic_target = Critic(state_dim, action_dim) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3) self.gamma = gamma self.tau = tau self.loss = nn.MSELoss() self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda) self.exploration_noise = OUNoise(action_dim) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) def act(self, state, is_noise): state = utils.to_tensor(state).unsqueeze(0) action = self.actor.forward(state) action = action.detach().numpy().flatten() if is_noise: action += self.exploration_noise.noise() return action def train_from_batch(self, batch): env_state_batch = torch.cat(batch.state) goal_batch = torch.cat(batch.goal) uvfa_states = torch.cat((env_state_batch, goal_batch), dim=1).detach() next_env_state_batch = torch.cat(batch.next_state) next_uvfa_states = torch.cat((next_env_state_batch, goal_batch), dim=1).detach() action_batch = torch.cat(batch.action).detach() reward_batch = torch.cat(batch.reward).detach() #if self.args.use_done_mask: done_batch = torch.cat(batch.done) #Load everything to GPU if not already # if self.args.is_memory_cuda and not self.args.is_cuda: self.actor.cuda() self.actor_target.cuda() self.critic_target.cuda() self.critic.cuda() uvfa_states = uvfa_states.cuda() next_uvfa_states = next_uvfa_states.cuda() action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() # if self.args.use_done_mask: done_batch = done_batch.cuda() #Critic Update with torch.no_grad(): next_action_batch = self.actor_target.forward(next_uvfa_states) next_q = self.critic_target.forward(next_uvfa_states, next_action_batch) #if self.args.use_done_mask: next_q = next_q * ( 1 - done_batch.float()) #Done mask target_q = reward_batch + (self.gamma * next_q) self.critic_optim.zero_grad() current_q = self.critic.forward((uvfa_states.detach()), (action_batch.detach())) dt = self.loss(current_q, target_q) dt.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), 10) self.critic_optim.step() #Actor Update self.actor_optim.zero_grad() policy_loss = -self.critic.forward( (uvfa_states), self.actor.forward((uvfa_states))) policy_loss = policy_loss.mean() policy_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), 10) self.actor_optim.step() soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) #Nets back to CPU if using memory_cuda self.actor.cpu() self.actor_target.cpu() self.critic_target.cpu() self.critic.cpu()
class Off_Policy_Algo(object): """Classes implementing TD3 and DDPG off-policy learners Parameters: args (object): Parameter class """ def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True): self.algo_name = algo_name; self.gamma = gamma; self.tau = tau #Initialize actors self.actor = Actor(state_dim, action_dim, wwid) if init_w: self.actor.apply(utils.init_weights) self.actor_target = Actor(state_dim, action_dim, wwid) utils.hard_update(self.actor_target, self.actor) self.actor_optim = Adam(self.actor.parameters(), actor_lr) self.critic = Critic(state_dim, action_dim) if init_w: self.critic.apply(utils.init_weights) self.critic_target = Critic(state_dim, action_dim) utils.hard_update(self.critic_target, self.critic) self.critic_optim = Adam(self.critic.parameters(), critic_lr) self.loss = nn.MSELoss() self.actor_target.cuda(); self.critic_target.cuda(); self.actor.cuda(); self.critic.cuda() self.num_critic_updates = 0 #Statistics Tracker self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]} self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]} self.critic_loss = {'mean':[]} self.q = {'min':[], 'max': [], 'mean':[], 'std':[]} self.val = {'min':[], 'max': [], 'mean':[], 'std':[]} def compute_stats(self, tensor, tracker): """Computes stats from intermediate tensors Parameters: tensor (tensor): tensor tracker (object): logger Returns: None """ tracker['min'].append(torch.min(tensor).item()) tracker['max'].append(torch.max(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) tracker['mean'].append(torch.mean(tensor).item()) def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs): """Runs a step of Bellman upodate and policy gradient using a batch of experiences Parameters: state_batch (tensor): Current States next_state_batch (tensor): Next States action_batch (tensor): Actions reward_batch (tensor): Rewards done_batch (tensor): Done batch num_epoch (int): Number of learning iteration to run with the same data Returns: None """ if isinstance(state_batch, list): state_batch = torch.cat(state_batch); next_state_batch = torch.cat(next_state_batch); action_batch = torch.cat(action_batch); reward_batch = torch.cat(reward_batch). 
done_batch = torch.cat(done_batch) for _ in range(num_epoch): ########### CRITIC UPDATE #################### #Compute next q-val, next_v and target with torch.no_grad(): #Policy Noise policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1])) policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip']) #Compute next action_bacth next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda() next_action_batch = torch.clamp(next_action_batch, 0,1) #Compute Q-val and value of next state masking by done q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch) q1 = (1 - done_batch) * q1 q2 = (1 - done_batch) * q2 #Select which q to use as next-q (depends on algo) if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min': next_q = torch.min(q1, q2) elif self.algo_name == 'DDPG': next_q = q1 elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2) #Compute target q and target val target_q = reward_batch + (self.gamma * next_q) self.critic_optim.zero_grad() current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch)) self.compute_stats(current_q1, self.q) dt = self.loss(current_q1, target_q) if self.algo_name == 'TD3' or self.algo_name == 'TD3_max': dt = dt + self.loss(current_q2, target_q) self.critic_loss['mean'].append(dt.item()) dt.backward() self.critic_optim.step() self.num_critic_updates += 1 #Delayed Actor Update if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: actor_actions = self.actor.forward(state_batch) Q1, Q2, val = self.critic.forward(state_batch, actor_actions) # if self.args.use_advantage: policy_loss = -(Q1 - val) policy_loss = -Q1 self.compute_stats(policy_loss,self.policy_loss) policy_loss = policy_loss.mean() self.actor_optim.zero_grad() policy_loss.backward(retain_graph=True) self.actor_optim.step() if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.actor_target, self.actor, self.tau) utils.soft_update(self.critic_target, self.critic, self.tau)
class SAC(object): def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha, target_update_interval, savetag, foldername, actualize, use_gpu): self.num_inputs = num_inputs self.action_space = action_dim self.gamma = gamma self.tau = 0.005 self.alpha = 0.2 self.policy_type = "Gaussian" self.target_update_interval = 1 self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'value_'+savetag, 'value_loss_'+savetag, 'policy_loss_'+savetag, 'mean_loss_'+savetag, 'std_loss_'+savetag], '.csv',save_iteration=1000, conv_size=1000) self.total_update = 0 self.agent_id = id self.actualize = actualize self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size) self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr) self.soft_q_criterion = nn.MSELoss() if self.policy_type == "Gaussian": self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy') self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr) self.value = ValueNetwork(self.num_inputs, hidden_size) self.value_target = ValueNetwork(self.num_inputs, hidden_size) self.value_optim = Adam(self.value.parameters(), lr=critic_lr) utils.hard_update(self.value_target, self.value) self.value_criterion = nn.MSELoss() else: self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy') self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr) self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size) utils.hard_update(self.critic_target, self.critic) self.policy.cuda() self.value.cuda() self.value_target.cuda() self.critic.cuda() #Statistics Tracker self.q = {'min':None, 'max': None, 'mean':None, 'std':None} self.val = {'min':None, 'max': None, 'mean':None, 'std':None} self.value_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.policy_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.mean_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.std_loss = {'min':None, 'max': None, 'mean':None, 'std':None} self.q_loss = {'min':None, 'max': None, 'mean':None, 'std':None} # def select_action(self, state, eval=False): # state = torch.FloatTensor(state).unsqueeze(0) # if eval == False: # self.policy.train() # action, _, _, _, _ = self.policy.evaluate(state) # else: # self.policy.eval() # _, _, _, action, _ = self.policy.evaluate(state) # # # action = torch.tanh(action) # action = action.detach().cpu().numpy() # return action[0] def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, mask_batch, updates, **ignore): # state_batch = torch.FloatTensor(state_batch) # next_state_batch = torch.FloatTensor(next_state_batch) # action_batch = torch.FloatTensor(action_batch) # reward_batch = torch.FloatTensor(reward_batch) # mask_batch = torch.FloatTensor(np.float32(mask_batch)) # reward_batch = reward_batch.unsqueeze(1) # reward_batch = [batch_size, 1] # mask_batch = mask_batch.unsqueeze(1) # mask_batch = [batch_size, 1] """ Use two Q-functions to mitigate positive bias in the policy improvement step that is known to degrade performance of value based methods. Two Q-functions also significantly speed up training, especially on harder task. 
""" expected_q1_value, expected_q2_value = self.critic(state_batch, action_batch) new_action, log_prob, _, mean, log_std = self.policy.noisy_action(state_batch, return_only_action=False) utils.compute_stats(expected_q1_value, self.q) if self.policy_type == "Gaussian": """ Including a separate function approximator for the soft value can stabilize training. """ expected_value = self.value(state_batch) utils.compute_stats(expected_value, self.val) target_value = self.value_target(next_state_batch) next_q_value = reward_batch + mask_batch * self.gamma * target_value # Reward Scale * r(st,at) - γV(target)(st+1)) else: """ There is no need in principle to include a separate function approximator for the state value. We use a target critic network for deterministic policy and eradicate the value value network completely. """ next_state_action, _, _, _, _, = self.policy.noisy_action(next_state_batch, return_only_action=False) target_critic_1, target_critic_2 = self.critic_target(next_state_batch, next_state_action) target_critic = torch.min(target_critic_1, target_critic_2) next_q_value = reward_batch + mask_batch * self.gamma * target_critic # Reward Scale * r(st,at) - γQ(target)(st+1) """ Soft Q-function parameters can be trained to minimize the soft Bellman residual JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1)) """ q1_value_loss = self.soft_q_criterion(expected_q1_value, next_q_value.detach()) q2_value_loss = self.soft_q_criterion(expected_q2_value, next_q_value.detach()) utils.compute_stats(q1_value_loss, self.q_loss) q1_new, q2_new = self.critic(state_batch, new_action) expected_new_q_value = torch.min(q1_new, q2_new) if self.policy_type == "Gaussian": """ Including a separate function approximator for the soft value can stabilize training and is convenient to train simultaneously with the other networks Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error. JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - log π(at|st)]))^2] ∇JV = ∇V(st)(V(st) - Q(st,at) + logπ(at|st)) """ next_value = expected_new_q_value - (self.alpha * log_prob) value_loss = self.value_criterion(expected_value, next_value.detach()) utils.compute_stats(value_loss, self.value_loss) else: pass """ Reparameterization trick is used to get a low variance estimator f(εt;st) = action sampled from the policy εt is an input noise vector, sampled from some fixed distribution Jπ = 𝔼st∼D,εt∼N[logπ(f(εt;st)|st)−Q(st,f(εt;st))] ∇Jπ =∇log π + ([∇at log π(at|st) − ∇at Q(st,at)])∇f(εt;st) """ policy_loss = ((self.alpha * log_prob) - expected_new_q_value) utils.compute_stats(policy_loss, self.policy_loss) policy_loss = policy_loss.mean() # Regularization Loss mean_loss = 0.001 * mean.pow(2) std_loss = 0.001 * log_std.pow(2) utils.compute_stats(mean_loss, self.mean_loss) utils.compute_stats(std_loss, self.std_loss) mean_loss = mean_loss.mean() std_loss = std_loss.mean() policy_loss += mean_loss + std_loss self.critic_optim.zero_grad() q1_value_loss.backward() self.critic_optim.step() self.critic_optim.zero_grad() q2_value_loss.backward() self.critic_optim.step() if self.policy_type == "Gaussian": self.value_optim.zero_grad() value_loss.backward() self.value_optim.step() else: value_loss = torch.tensor(0.) 
self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() self.total_update += 1 if self.agent_id == 0: self.tracker.update([self.q['mean'], self.q_loss['mean'], self.val['mean'], self.value_loss['mean'] , self.policy_loss['mean'], self.mean_loss['mean'], self.std_loss['mean']], self.total_update) """ We update the target weights to match the current value function weights periodically Update target parameter after every n(args.target_update_interval) updates """ if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic": utils.soft_update(self.critic_target, self.critic, self.tau) elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian": utils.soft_update(self.value_target, self.value, self.tau) return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(), policy_loss.item() # Save model parameters def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None): if not os.path.exists('models/'): os.makedirs('models/') if actor_path is None: actor_path = "models/sac_actor_{}_{}".format(env_name, suffix) if critic_path is None: critic_path = "models/sac_critic_{}_{}".format(env_name, suffix) if value_path is None: value_path = "models/sac_value_{}_{}".format(env_name, suffix) print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path)) torch.save(self.value.state_dict(), value_path) torch.save(self.policy.state_dict(), actor_path) torch.save(self.critic.state_dict(), critic_path) # Load model parameters def load_model(self, actor_path, critic_path, value_path): print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path)) if actor_path is not None: self.policy.load_state_dict(torch.load(actor_path)) if critic_path is not None: self.critic.load_state_dict(torch.load(critic_path)) if value_path is not None: self.value.load_state_dict(torch.load(value_path))
self.ns = torch.cat((self.ns, torch.Tensor(ns)), 0) self.a = torch.cat((self.a, torch.Tensor(a)), 0) self.r = torch.cat((self.r, torch.Tensor(r)), 0) self.done = torch.cat((self.done, torch.Tensor(done)), 0) self.num_entries = len(self.s) if self.num_entries > DATA_LIMIT: break print('BUFFER LOADED WITH', self.num_entries, 'SAMPLES') # self.s = self.s.pin_memory() # self.ns = self.ns.pin_memory() # self.a = self.a.pin_memory() # self.r = self.r.pin_memory() # self.done = self.done.pin_memory() args = Parameters() pg_model = Actor(args) pg_model.load_state_dict(torch.load('R_Skeleton/models/champ')) evo_model = Actor(args) evo_model.load_state_dict(torch.load('R_Skeleton/rl_models/td3_best0.95_RS_PROP0.9__ADV_-5.0_-7.5_-5.0_0.0')) k = None
def __init__(self, args): # need to intialize rollout_workers to have blue agent self.args = args self.evolver = SSNE( self.args) # this evolver implements neuro-evolution # MP TOOLS self.manager = Manager() self.mutate_algos = [ Mutation_Add(self), Mutation_Delete(self), Mutation_Exchange(self) ] #store all the mutate algorithm objects # Genealogy tool self.genealogy = Genealogy() # Init BUFFER self.replay_buffer = Buffer(1000000, self.args.buffer_gpu) #if SA_FLAG: self.metrics = [] self.last_portfolio = None self.T_max = 30 self.T = self.T_max self.T_min = 0.2 self.decay_rate = 0.975 # Initialize population self.pop = self.manager.list() for _ in range(args.pop_size): wwid = self.genealogy.new_id('evo') if ALGO == 'SAC': self.pop.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid)) elif ALGO == 'TD3': self.pop.append( Actor(args.state_dim, args.action_dim, wwid, ALGO)) # use ALGO to distinguish differe net architecture elif ALGO == 'dis' or 'TD3_tennis': self.pop.append( Actor(args.state_dim, args.action_dim, wwid, ALGO)) else: assert False, "invalid algorithm type" if ALGO == "SAC": self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1) else: self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO) if ALGO == 'dis': self.average_policy = AverageActor(args.state_dim, args.action_dim, -2, ALGO, self.pop, self.replay_buffer, args.buffer_gpu, args.batch_size, iterations=10) self.average_policy.share_memory() self.best_policy.share_memory() # added by macheng, share the best policy accross processes (used as internal belief update models for blue) # now we assign shared blue_trainer, we should train this agent such that the roll_out workers are also up to date # should make sure that self.best_policy (emergent learner) is also shared if ALGO == 'dis' or 'TD3_tennis': assert hasattr( args, "blue_trainer" ), "must have blue_agent trainer to intialize rollout_worker, see line 109, class Parameter definition" if ALGO == 'dis': trainers = [args.blue_trainer, self.average_policy] else: trainers = [args.blue_trainer, None ] if ALGO == 'TD3_tennis' else [] self.trainers = trainers self.blue_dqn = args.blue_trainer # Turn off gradients and put in eval mod for actor in self.pop: actor = actor.cpu() actor.eval() # Intialize portfolio of learners self.portfolio = [] self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID) self.complement_portfolio = [ ] #complementary of the portfolio, whatever not in the portfolio should be stored here self.total_rollout_bucket = self.manager.list( ) #macheng: we use total_rollout_bucket to represents the whole set of rollout models, now rollout_bukcet dynamically resize according to portforlio, for SA self.rollout_bucket = self.total_rollout_bucket #self.rollout_bucket = self.manager.list() #print("rollout_bucker needs to be updated, main.py line 239 ") for _ in range(len(self.portfolio)): if ALGO == 'SAC': self.rollout_bucket.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) else: self.rollout_bucket.append( Actor(args.state_dim, args.action_dim, -1, ALGO)) # Initialize shared data bucket self.data_bucket = self.replay_buffer.tuples ############## MULTIPROCESSING TOOLS ################### # Evolutionary population Rollout workers self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_workers = [ Process(target=rollout_worker, args=(id, 0, 
self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO, self.trainers)) for id in range(args.pop_size) ] for worker in self.evo_workers: worker.start() self.evo_flag = [True for _ in range(args.pop_size)] # Learner rollout workers self.task_pipes = [Pipe() for _ in range(args.rollout_size)] self.result_pipes = [Pipe() for _ in range(args.rollout_size)] self.workers = [ Process(target=rollout_worker, args=(id, 1, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers)) for id in range(args.rollout_size) ] for worker in self.workers: worker.start() self.roll_flag = [True for _ in range(args.rollout_size)] # Test bucket self.test_bucket = self.manager.list() if ALGO == 'SAC': self.test_bucket.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) else: self.test_bucket.append( Actor(args.state_dim, args.action_dim, -1, ALGO)) # 5 Test workers self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)] self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)] self.test_workers = [ Process(target=rollout_worker, args=(id, 2, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers)) for id in range(TEST_SIZE) ] for worker in self.test_workers: worker.start() self.test_flag = False # Meta-learning controller (Resource Distribution) self.allocation = [ ] #Allocation controls the resource allocation across learners for i in range(args.rollout_size): self.allocation.append( i % len(self.portfolio)) #Start uniformly (equal resources) # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistsitic (each node is a learner), to compute UCB scores # Trackers self.best_score = -np.inf self.gen_frames = 0 self.total_frames = 0 self.best_shaped_score = None self.test_score = None self.test_std = None
def __init__(self, args, id): self.args = args self.id = id ###Initalize neuroevolution module### self.evolver = SSNE(self.args) ########Initialize population self.manager = Manager() self.popn = self.manager.list() for _ in range(args.popn_size): if args.ps == 'trunk': self.popn.append( MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)) else: if args.algo_name == 'TD3': self.popn.append( Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='DeterministicPolicy')) else: self.popn.append( Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='GaussianPolicy')) self.popn[-1].eval() #### INITIALIZE PG ALGO ##### if args.ps == 'trunk': if self.args.is_matd3 or args.is_maddpg: algo_name = 'TD3' if self.args.is_matd3 else 'DDPG' self.algo = MATD3(id, algo_name, args.state_dim, args.action_dim, args.hidden_size, args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag, args.aux_save, args.actualize, args.use_gpu, args.config.num_agents, args.init_w) else: self.algo = MultiTD3(id, args.algo_name, args.state_dim, args.action_dim, args.hidden_size, args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag, args.aux_save, args.actualize, args.use_gpu, args.config.num_agents, args.init_w) else: if args.algo_name == 'TD3': self.algo = TD3(id, args.algo_name, args.state_dim, args.action_dim, args.hidden_size, args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag, args.aux_save, args.actualize, args.use_gpu, args.init_w) else: self.algo = SAC(id, args.state_dim, args.action_dim, args.hidden_size, args.gamma, args.critic_lr, args.actor_lr, args.tau, args.alpha, args.target_update_interval, args.savetag, args.aux_save, args.actualize, args.use_gpu) #### Rollout Actor is a template used for MP ##### self.rollout_actor = self.manager.list() if args.ps == 'trunk': self.rollout_actor.append( MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)) else: if args.algo_name == 'TD3': self.rollout_actor.append( Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='DeterministicPolicy')) else: self.rollout_actor.append( Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='GaussianPolicy')) #Initalize buffer if args.ps == 'trunk': self.buffer = [ Buffer(args.buffer_size, buffer_gpu=False, filter_c=args.filter_c) for _ in range(args.config.num_agents) ] else: self.buffer = Buffer(args.buffer_size, buffer_gpu=False, filter_c=args.filter_c) #Agent metrics self.fitnesses = [[] for _ in range(args.popn_size)] ###Best Policy HOF#### self.champ_ind = 0
class TD3(object):
    """Classes implementing TD3 and DDPG off-policy learners

    Parameters:
        args (object): Parameter class
    """

    def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr,
                 gamma, tau, savetag, foldername, actualize, use_gpu, init_w=True):

        self.algo_name = algo_name; self.gamma = gamma; self.tau = tau
        self.total_update = 0; self.agent_id = id
        self.actualize = actualize; self.use_gpu = use_gpu
        self.tracker = utils.Tracker(foldername,
                                     ['q_' + savetag, 'qloss_' + savetag, 'policy_loss_' + savetag,
                                      'alz_score' + savetag, 'alz_policy' + savetag],
                                     '.csv', save_iteration=1000, conv_size=1000)

        # Initialize actors
        self.policy = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
        if init_w: self.policy.apply(utils.init_weights)
        self.policy_target = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
        utils.hard_update(self.policy_target, self.policy)
        self.policy_optim = Adam(self.policy.parameters(), actor_lr)

        self.critic = QNetwork(state_dim, action_dim, hidden_size)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        if actualize:
            self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
            if init_w: self.ANetwork.apply(utils.init_weights)
            self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
            self.actualize_lr = 0.2
            if use_gpu: self.ANetwork.cuda()

        self.loss = nn.MSELoss()

        if use_gpu:
            self.policy_target.cuda(); self.critic_target.cuda(); self.policy.cuda(); self.critic.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        # self.action_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.alz_score = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.alz_policy = {'min': None, 'max': None, 'mean': None, 'std': None}
        # self.val = {'min': None, 'max': None, 'mean': None, 'std': None}
        # self.value_loss = {'min': None, 'max': None, 'mean': None, 'std': None}

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch,
                          global_reward, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

        Parameters:
            state_batch (tensor): Current States
            next_state_batch (tensor): Next States
            action_batch (tensor): Actions
            reward_batch (tensor): Rewards
            done_batch (tensor): Done batch
            num_epoch (int): Number of learning iterations to run with the same data

        Returns:
            None
        """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)
            global_reward = torch.cat(global_reward)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'],
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])
                if self.use_gpu: policy_noise = policy_noise.cuda()

                # Compute next action_batch with target policy smoothing
                next_action_batch = self.policy_target.clean_action(next_state_batch,
                                                                    return_only_action=True) + policy_noise
                next_action_batch = torch.clamp(next_action_batch, -1, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2 = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2
                # next_val = (1 - done_batch) * next_val

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                # if self.args.use_advantage: target_val = reward_batch + (self.gamma * next_val)

            if self.actualize:
                ########## Actualization Network Update ##########
                current_Ascore = self.ANetwork.forward(state_batch, action_batch)
                utils.compute_stats(current_Ascore, self.alz_score)
                target_Ascore = (self.actualize_lr) * (global_reward * 10.0) + \
                                (1 - self.actualize_lr) * current_Ascore.detach()
                actualize_loss = self.loss(target_Ascore, current_Ascore).mean()

            self.critic_optim.zero_grad()
            current_q1, current_q2 = self.critic.forward(state_batch, action_batch)
            utils.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            # if self.args.use_advantage:
            #     dt = dt + self.loss(current_val, target_val)
            #     utils.compute_stats(current_val, self.val)
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            utils.compute_stats(dt, self.q_loss)

            # if self.args.critic_constraint:
            #     if dt.item() > self.args.critic_constraint_w:
            #         dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            if self.actualize:
                self.actualize_optim.zero_grad()
                actualize_loss.backward()
                self.actualize_optim.step()

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.policy.clean_action(state_batch, return_only_action=False)

                # # Trust Region constraint
                # if self.args.trust_region_actor:
                #     with torch.no_grad(): old_actor_actions = self.actor_target.forward(state_batch)
                #     actor_actions = action_batch - old_actor_actions

                Q1, Q2 = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1
                utils.compute_stats(-policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                ### Actualize Policy Update
                if self.actualize:
                    A1 = self.ANetwork.forward(state_batch, actor_actions)
                    utils.compute_stats(A1, self.alz_policy)
                    policy_loss += -A1.mean() * 0.1

                self.policy_optim.zero_grad()
                policy_loss.backward(retain_graph=True)
                # nn.utils.clip_grad_norm_(self.actor.parameters(), 10)

                # if self.args.action_loss:
                #     action_loss = torch.abs(actor_actions - 0.5)
                #     utils.compute_stats(action_loss, self.action_loss)
                #     action_loss = action_loss.mean() * self.args.action_loss_w
                #     action_loss.backward()
                #     # if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9  # Decay action_w loss if action loss is larger than policy gradient loss

                self.policy_optim.step()

            # if self.args.hard_update:
            #     if self.num_critic_updates % self.args.hard_update_freq == 0:
            #         self.hard_update(self.actor_target, self.actor)
            #         self.hard_update(self.critic_target, self.critic)

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.policy_target, self.policy, self.tau)
                utils.soft_update(self.critic_target, self.critic, self.tau)

            self.total_update += 1
            if self.agent_id == 0:
                self.tracker.update([self.q['mean'], self.q_loss['mean'], self.policy_loss['mean'],
                                     self.alz_score['mean'], self.alz_policy['mean']],
                                    self.total_update)
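# Illustrative sketch: the update above is the TD3 recipe -- target policy smoothing,
# a clipped double-Q target, and delayed actor/target updates. The target computation
# in isolation, assuming actions in [-1, 1] and generic policy_target / critic_target
# callables (not the project's own classes):
import torch

def td3_target(critic_target, policy_target, next_state, reward, done,
               gamma=0.99, noise_std=0.2, noise_clip=0.5):
    """Clipped double-Q TD3 target: r + gamma * (1 - done) * min(Q1', Q2')."""
    with torch.no_grad():
        next_action = policy_target(next_state)
        noise = (torch.randn_like(next_action) * noise_std).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-1.0, 1.0)    # target policy smoothing
        q1, q2 = critic_target(next_state, next_action)
        next_q = torch.min(q1, q2) * (1.0 - done)                # mask terminal states
        return reward + gamma * next_q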
class TD3(object):
    """Classes implementing TD3 and DDPG off-policy learners

    Parameters:
        args (object): Parameter class
    """

    def to_cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()

    def __init__(self, args):
        self.args = args

        self.actor = Actor(args)
        self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        self.hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        self.hard_update(self.critic_target, self.critic)

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()

        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

        Parameters:
            tensor (tensor): tensor
            tracker (object): logger

        Returns:
            None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch,
                          done_batch, dpp, num_epoch=1):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

        Parameters:
            state_batch (tensor): Current States
            next_state_batch (tensor): Next States
            action_batch (tensor): Actions
            reward_batch (tensor): Rewards
            done_batch (tensor): Done batch
            num_epoch (int): Number of learning iterations to run with the same data

        Returns:
            None
        """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, self.args.policy_noise,
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -self.args.policy_noise_clip, self.args.policy_noise_clip)

                # Compute next action_batch
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2
                next_val = (1 - done_batch) * next_val
                next_q = torch.min(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(state_batch, action_batch)
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            dt = dt + self.loss(current_val, target_val)
            self.compute_stats(current_val, self.val)
            dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % self.args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)

                if dpp:
                    policy_loss = -self.shape_dpp(self.critic, self.actor, state_batch,
                                                  self.args.sensor_model)
                else:
                    Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
                    policy_loss = -(Q1 - val)

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()
                policy_loss.backward(retain_graph=True)

                if self.args.action_loss:
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * self.args.action_loss_w
                    action_loss.backward()
                    # if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9  # Decay action_w loss if action loss is larger than policy gradient loss

                self.actor_optim.step()

            if self.num_critic_updates % self.args.policy_ups_freq == 0:
                self.soft_update(self.actor_target, self.actor, self.tau)
                self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

        Parameters:
            target (object): A pytorch model
            source (object): A pytorch model
            tau (float): Tau parameter

        Returns:
            None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

        Parameters:
            target (object): A pytorch model
            source (object): A pytorch model

        Returns:
            None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def shape_dpp(self, critic, actor, state, sensor_model):
        Q1, _, val = critic((state), actor((state)))
        original_T = Q1 - val
        all_adv = [original_T]

        state = utils.to_numpy(state.cpu())
        # mid_index = int(180 / self.args.angle_res)
        coupling = self.args.coupling
        max_ind = int(360 / self.args.angle_res)

        perturb_index = [np.argwhere(state[i, 0:max_ind] != -1).flatten() for i in range(len(state))]
        for i, entry in enumerate(perturb_index):
            np.random.shuffle(entry)
            if len(entry) < coupling:
                perturb_index[i] = np.tile(entry, (coupling, 1)).flatten()

        for coupling_mag in range(coupling):
            empty_ind = [int(entry[coupling_mag]) for entry in perturb_index]

            if sensor_model == 'density':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0
            elif sensor_model == 'closets':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0

            shaped_state = utils.to_tensor(state).cuda()
            Q1, _, val = critic((shaped_state), actor((shaped_state)))
            adv = (Q1 - val) / (coupling_mag + 1)
            all_adv.append(adv)

        all_adv = torch.cat(all_adv, 1)
        dpp_max = torch.max(all_adv, 1)[0].unsqueeze(1)
        with torch.no_grad():
            normalizer = dpp_max / original_T
        return original_T * normalizer
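# Illustrative sketch: a hypothetical driver for the learner above. The replay buffer and
# its sample() signature are assumptions; the point is the argument order expected by
# update_parameters and the dpp switch that selects the D++-shaped policy gradient.
def train_step(td3, replay, batch_size=128):
    s, a, r, ns, d = replay.sample(batch_size)   # assumed to return CUDA tensors
    td3.update_parameters(state_batch=s,
                          next_state_batch=ns,
                          action_batch=a,
                          reward_batch=r,
                          done_batch=d,
                          dpp=False,              # plain TD3 policy gradient
                          num_epoch=1)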
def mutate(root, info, input=None):
    ok = True
    actor_instance = Actor(name=input.name, pic=input.pic)
    actor_instance.save()
    return CreateActor(ok=ok, actor=actor_instance)
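# Illustrative sketch: the mutate() resolver above normally lives inside a graphene
# Mutation class. ActorInput and ActorType are assumed to be defined elsewhere in the
# schema; only the overall shape of the enclosing class is shown here.
import graphene

class CreateActor(graphene.Mutation):
    class Arguments:
        input = ActorInput(required=True)

    ok = graphene.Boolean()
    actor = graphene.Field(ActorType)

    def mutate(root, info, input=None):
        actor_instance = Actor(name=input.name, pic=input.pic)
        actor_instance.save()
        return CreateActor(ok=True, actor=actor_instance)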
class CERL_Agent: """Main CERL class containing all methods for CERL Parameters: args (int): Parameter class with all the parameters """ def __init__(self, args): # need to intialize rollout_workers to have blue agent self.args = args self.evolver = SSNE( self.args) # this evolver implements neuro-evolution # MP TOOLS self.manager = Manager() self.mutate_algos = [ Mutation_Add(self), Mutation_Delete(self), Mutation_Exchange(self) ] #store all the mutate algorithm objects # Genealogy tool self.genealogy = Genealogy() # Init BUFFER self.replay_buffer = Buffer(1000000, self.args.buffer_gpu) #if SA_FLAG: self.metrics = [] self.last_portfolio = None self.T_max = 30 self.T = self.T_max self.T_min = 0.2 self.decay_rate = 0.975 # Initialize population self.pop = self.manager.list() for _ in range(args.pop_size): wwid = self.genealogy.new_id('evo') if ALGO == 'SAC': self.pop.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid)) elif ALGO == 'TD3': self.pop.append( Actor(args.state_dim, args.action_dim, wwid, ALGO)) # use ALGO to distinguish differe net architecture elif ALGO == 'dis' or 'TD3_tennis': self.pop.append( Actor(args.state_dim, args.action_dim, wwid, ALGO)) else: assert False, "invalid algorithm type" if ALGO == "SAC": self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1) else: self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO) if ALGO == 'dis': self.average_policy = AverageActor(args.state_dim, args.action_dim, -2, ALGO, self.pop, self.replay_buffer, args.buffer_gpu, args.batch_size, iterations=10) self.average_policy.share_memory() self.best_policy.share_memory() # added by macheng, share the best policy accross processes (used as internal belief update models for blue) # now we assign shared blue_trainer, we should train this agent such that the roll_out workers are also up to date # should make sure that self.best_policy (emergent learner) is also shared if ALGO == 'dis' or 'TD3_tennis': assert hasattr( args, "blue_trainer" ), "must have blue_agent trainer to intialize rollout_worker, see line 109, class Parameter definition" if ALGO == 'dis': trainers = [args.blue_trainer, self.average_policy] else: trainers = [args.blue_trainer, None ] if ALGO == 'TD3_tennis' else [] self.trainers = trainers self.blue_dqn = args.blue_trainer # Turn off gradients and put in eval mod for actor in self.pop: actor = actor.cpu() actor.eval() # Intialize portfolio of learners self.portfolio = [] self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID) self.complement_portfolio = [ ] #complementary of the portfolio, whatever not in the portfolio should be stored here self.total_rollout_bucket = self.manager.list( ) #macheng: we use total_rollout_bucket to represents the whole set of rollout models, now rollout_bukcet dynamically resize according to portforlio, for SA self.rollout_bucket = self.total_rollout_bucket #self.rollout_bucket = self.manager.list() #print("rollout_bucker needs to be updated, main.py line 239 ") for _ in range(len(self.portfolio)): if ALGO == 'SAC': self.rollout_bucket.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) else: self.rollout_bucket.append( Actor(args.state_dim, args.action_dim, -1, ALGO)) # Initialize shared data bucket self.data_bucket = self.replay_buffer.tuples ############## MULTIPROCESSING TOOLS ################### # Evolutionary population Rollout workers self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)] 
self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_workers = [ Process(target=rollout_worker, args=(id, 0, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO, self.trainers)) for id in range(args.pop_size) ] for worker in self.evo_workers: worker.start() self.evo_flag = [True for _ in range(args.pop_size)] # Learner rollout workers self.task_pipes = [Pipe() for _ in range(args.rollout_size)] self.result_pipes = [Pipe() for _ in range(args.rollout_size)] self.workers = [ Process(target=rollout_worker, args=(id, 1, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers)) for id in range(args.rollout_size) ] for worker in self.workers: worker.start() self.roll_flag = [True for _ in range(args.rollout_size)] # Test bucket self.test_bucket = self.manager.list() if ALGO == 'SAC': self.test_bucket.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) else: self.test_bucket.append( Actor(args.state_dim, args.action_dim, -1, ALGO)) # 5 Test workers self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)] self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)] self.test_workers = [ Process(target=rollout_worker, args=(id, 2, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers)) for id in range(TEST_SIZE) ] for worker in self.test_workers: worker.start() self.test_flag = False # Meta-learning controller (Resource Distribution) self.allocation = [ ] #Allocation controls the resource allocation across learners for i in range(args.rollout_size): self.allocation.append( i % len(self.portfolio)) #Start uniformly (equal resources) # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistsitic (each node is a learner), to compute UCB scores # Trackers self.best_score = -np.inf self.gen_frames = 0 self.total_frames = 0 self.best_shaped_score = None self.test_score = None self.test_std = None # trainer contains the blue_dqn to be trained, and the red model used for belief update, red_actor is the actual red agent trained against # id is the actual red agent id def _update_SA_temperature(self): self.T = max(self.T * self.decay_rate, self.T_min) def _get_accept_rate(self): if RANDOM_WALK: return 1.0 else: if self.metrics[-1] > self.metrics[-2]: return 1.0 else: return np.exp((self.metrics[-1] - self.metrics[-2]) / self.T) def _mutate(self): while True: mutate_algo_index = random.choice(range(3)) if self._try_mutate(mutate_algo_index): return def _try_mutate(self, algo_index): # 0 for add, 1 for delete, 2 for exchange return self.mutate_algos[algo_index].try_mutate() def simulated_annealing(self, metric): #take in the current metric self.metrics.append(metric) if self.last_portfolio: #has last_portfolio accept_rate = self._get_accept_rate() #based on self.metrics[-2:] self._update_SA_temperature() if np.random.random() > accept_rate: #reject self.portfolio = self.last_portfolio self.complement_portfolio = self.last_complement_portfolio self.last_portfolio = copy.copy( self.portfolio) #maintain a shallow copy as self.last_complement_portfolio = copy.copy(self.complement_portfolio) self._mutate() #perturb the portfolio # update rollout_bucket size, only the first len(self.portfolio) rollout_buckets are visible self.update_rollout_bucket() # update 
allocation, to be compatible with the current portfolio self.update_allocation() def update_rollout_bucket(self): self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)] def train_blue_dqn( self, trainers, env_name, gen, ALGO='dis', pomdp_adv=False ): #in this method, rollout and training are done together, opponent sampled from the population NUM_EPISODE = 100 #train 100 episodes for the blue to converge to the new best response to red EPS_START = max(1.0 * 0.5**(gen - 10), 0.15) if gen >= 10 else 1.0 #initial epsilon EPS_END = 0.05 EPS_DECAY = 0.995 if ALGO == 'dis': # make env with blue and red policy agent inside, assert trainers is not None dis_env = make_self_play_env( seed=np.random.choice(np.array(range(len(self.pop)))), return_policy_agent=False, trainers=trainers )[0] # trainer if not None, first is the shared DQN agent, second is the best red policy env = EnvironmentWrapper( env_name, ALGO, dis_env, 0) # the "0" is the index for training blue agent elif ALGO == 'TD3_tennis': no_graphics = not RENDER tennis_env = make_tennis_env.TennisEnvFactory( seed=np.random.choice(np.array(range(len(self.pop)))), no_graphics=no_graphics, pid=-1).getEnv()[0] env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0) else: env = EnvironmentWrapper(env_name, ALGO) blue_dqn = trainers[0] average_reward = 0 eps = EPS_START average_red_reward = 0 red_count = 0 average_actual_blue_reward = 0 blue_count = 0 for it in range(NUM_EPISODE): if not pomdp_adv: #if pomdp_adv, make sure that TD3_actor is never used id = np.random.choice(np.array(range(len(self.pop)))) red_actor = self.pop[id] env.set_TD3_actor(red_actor) fitness = 0.0 #here fitness if simplely reward total_frame = 0 state = env.reset() env.randomize_neu_adv() if pomdp_adv: env.try_set_pomdp_adv( ) #try to set if opponent to pomdp adv if opponent is adversary, else do nothing render_flag = (np.random.random() < 0.05) while True: # unless done action = blue_dqn.act(state, eps=eps) # action = utils.to_numpy(action) next_state, reward, done, info = env.step( copy.deepcopy(action), use_actual_reward=DRQN ) #after calling env.step, evaluator initialized later does not work #should be something wrong with the internal red model? 
blue_dqn.step(state, action, reward, next_state, done) if render_flag and self.args.render: env.render() # next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0) state = next_state fitness += reward total_frame += 1 # DONE FLAG IS Received if done: average_red_reward += env.get_red_reward( ) if env.get_red_reward() is not None else 0 average_actual_blue_reward += env.get_blue_actual_reward( ) if env.get_blue_actual_reward() is not None else 0 red_count += 1 if env.get_red_reward() is not None else 0 blue_count += 1 if env.get_blue_actual_reward( ) is not None else 0 if render_flag: env.env.close() break average_reward += fitness eps = max(EPS_END, EPS_DECAY * eps) if gen >= 10 and gen % 5 == 0: blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' + str(gen) + '.pth') average_reward /= NUM_EPISODE if red_count != 0: average_red_reward /= red_count if blue_count != 0: average_actual_blue_reward /= blue_count return average_reward, average_red_reward, average_actual_blue_reward def evaluate_training_fixed_blue( self): #this evaluate against the training opponent (red pop) self.evaluator.pomdp_adv = False return self.evaluator.evaluate_fixed_agents(self.trainers[0], self.trainers[1], self.pop) def train(self, gen, frame_tracker): """Main training loop to do rollouts, neureoevolution, and policy gradients Parameters: gen (int): Current epoch of training Returns: None """ ################ START ROLLOUTS ############## # Start Evolution rollouts if not ISOLATE_PG: for id, actor in enumerate(self.pop): if self.evo_flag[id]: self.evo_task_pipes[id][0].send((id, gen)) self.evo_flag[id] = False # Sync all learners actor to cpu (rollout) actor # (update rollout parameter using the learner parameter, such that rollout worker is up to date) for i, learner in enumerate(self.portfolio): #number of learner learner.algo.actor.cpu() utils.hard_update( self.rollout_bucket[i], learner.algo.actor ) #rollout bucket is now synchronized with learner to perform rollout for learner actors if torch.cuda.is_available(): learner.algo.actor.cuda() # Start Learner rollouts for rollout_id, learner_id in enumerate( self.allocation): #number of rollout_size if self.roll_flag[rollout_id]: self.task_pipes[rollout_id][0].send( (learner_id, gen) ) #allocation record the id of the learner that bucket should run, so rollout_id is the id of rollout_bucket self.roll_flag[rollout_id] = False # Start Test rollouts if gen % 5 == 0: self.test_flag = True for pipe in self.test_task_pipes: pipe[0].send((0, gen)) ############# UPDATE PARAMS USING GRADIENT DESCENT ########## # main training loop if self.replay_buffer.__len__( ) > self.args.batch_size * 10: ###BURN IN PERIOD self.replay_buffer.tensorify( ) # Tensorify the buffer for fast sampling # Spin up threads for each learner threads = [ threading.Thread( target=learner.update_parameters, args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size, int(self.gen_frames * self.args.gradperstep))) for learner in self.portfolio ] #macheng: do we want to train all the learners? 
# Start threads for thread in threads: thread.start() # Join threads for thread in threads: thread.join() # Now update average_policy #self.average_policy.cuda() if ALGO == 'dis': self.average_policy.update( ) #update the average_policy parameter with supervised learning self.gen_frames = 0 #########Visualize Learner Critic Function################# # if self.replay_buffer.__len__() % 2500 == 0: # visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50) #arguments: Learner, env, N_GRID ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############ if not ISOLATE_PG: all_fitness = [] all_net_ids = [] all_eplens = [] while True: for i in range(self.args.pop_size): if self.evo_result_pipes[i][1].poll(): entry = self.evo_result_pipes[i][1].recv() all_fitness.append(entry[1]) all_net_ids.append(entry[0]) all_eplens.append(entry[2]) self.gen_frames += entry[2] self.total_frames += entry[2] self.evo_flag[i] = True # Soft-join (50%) if len(all_fitness ) / self.args.pop_size >= self.args.asynch_frac: break ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############ for i in range(self.args.rollout_size): entry = self.result_pipes[i][1].recv() learner_id = entry[0] fitness = entry[1] num_frames = entry[2] self.portfolio[learner_id].update_stats(fitness, num_frames) self.gen_frames += num_frames self.total_frames += num_frames if fitness > self.best_score: self.best_score = fitness self.roll_flag[i] = True # Referesh buffer (housekeeping tasks - pruning to keep under capacity) self.replay_buffer.referesh() ######################### END OF PARALLEL ROLLOUTS ################ ############ PROCESS MAX FITNESS ############# # ms:best policy is always up to date # so here the best learner is saved if not ISOLATE_PG: champ_index = all_net_ids[all_fitness.index(max(all_fitness))] utils.hard_update(self.test_bucket[0], self.pop[champ_index]) if max(all_fitness) > self.best_score: self.best_score = max(all_fitness) utils.hard_update(self.best_policy, self.pop[champ_index]) if SAVE: torch.save( self.pop[champ_index].state_dict(), self.args.aux_folder + ENV_NAME + '_best' + SAVETAG) print("Best policy saved with score", '%.2f' % max(all_fitness)) else: #Run PG in isolation utils.hard_update(self.test_bucket[0], self.rollout_bucket[0]) ###### TEST SCORE ###### if self.test_flag: self.test_flag = False test_scores = [] for pipe in self.test_result_pipes: #Collect all results entry = pipe[1].recv() test_scores.append(entry[1]) test_scores = np.array(test_scores) test_mean = np.mean(test_scores) test_std = (np.std(test_scores)) # Update score to trackers frame_tracker.update([test_mean], self.total_frames) else: test_mean, test_std = None, None # NeuroEvolution's probabilistic selection and recombination step # ms: this epoch() method implements neuro-evolution if not ISOLATE_PG: #seems pop_size and rollout_size must be 10, otherwise this will produce error if gen % 5 == 0: self.evolver.epoch( gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket ) #this method also copies learner to evoler else: self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, []) # META LEARNING - RESET ALLOCATION USING UCB if gen % 1 == 0: self.update_allocation() # Metrics if not ISOLATE_PG: champ_len = all_eplens[all_fitness.index(max(all_fitness))] champ_wwid = int(self.pop[champ_index].wwid.item()) max_fit = max(all_fitness) else: champ_len = num_frames champ_wwid = int(self.rollout_bucket[0].wwid.item()) all_fitness = [fitness] max_fit = fitness all_eplens = 
[num_frames] return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid def update_allocation(self): self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient) def sim_and_eval_POMDP(self): self.evaluator = Evaluator( self, 5, self.trainers, pomdp_adv=True) # evaluator must be created before train_dqn for gen in range(1000000): print('gen=', gen) blue_score, red_score, actual_blue_score = agent.train_blue_dqn( agent.trainers, ENV_NAME, gen, ALGO='dis', pomdp_adv=True) print('Env', ENV_NAME, 'Gen', gen, ", Training average: Blue agent score: ", blue_score, " Red score: ", red_score, " Actual blue score: ", actual_blue_score) blue_score, red_score, actual_blue_score = self.evaluator.evaluate( ) print("Evaluation result: Blue agent score: ", blue_score, " Red score: ", red_score, " Actual blue score: ", actual_blue_score)
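# Illustrative sketch: update_allocation() above delegates to a ucb() helper that is not
# shown in this file. One plausible implementation assigns each rollout worker to the
# learner with the highest upper-confidence-bound score; the learner attributes
# (value, visit_count) are assumptions about what the portfolio entries track.
import numpy as np

def ucb_sketch(num_rollouts, portfolio, c):
    """Assign each rollout worker to the learner with the highest UCB score."""
    values = np.array([getattr(l, 'value', 0.0) for l in portfolio], dtype=np.float64)
    visits = np.array([getattr(l, 'visit_count', 0) for l in portfolio], dtype=np.float64) + 1.0
    allocation = []
    for _ in range(num_rollouts):
        total = visits.sum()
        scores = values + c * np.sqrt(np.log(total) / visits)
        pick = int(np.argmax(scores))
        allocation.append(pick)
        visits[pick] += 1.0   # discourage piling every worker onto one learner
    return allocation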
class DDPG(object): def __init__(self, args): self.args = args self.actor = Actor(args, init=True) self.actor_target = Actor(args, init=True) self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4) self.critic = Critic(args) self.critic_target = Critic(args) self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3) self.gamma = args.gamma self.tau = self.args.tau self.loss = nn.MSELoss() hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) def update_parameters(self, batch): state_batch = torch.cat(batch.state) next_state_batch = torch.cat(batch.next_state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) if self.args.use_done_mask: done_batch = torch.cat(batch.done) #Load everything to GPU if not already if self.args.is_memory_cuda and not self.args.is_cuda: self.actor.cuda() self.actor_target.cuda() self.critic_target.cuda() self.critic.cuda() state_batch = state_batch.cuda() next_state_batch = next_state_batch.cuda() action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() if self.args.use_done_mask: done_batch = done_batch.cuda() #Critic Update next_action_batch = self.actor_target.forward(next_state_batch) with torch.no_grad(): next_q = self.critic_target.forward(next_state_batch, next_action_batch) if self.args.use_done_mask: next_q = next_q * (1 - done_batch.float()) #Done mask target_q = reward_batch + (self.gamma * next_q) self.critic_optim.zero_grad() current_q = self.critic.forward((state_batch), (action_batch)) dt = self.loss(current_q, target_q) dt.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), 10) self.critic_optim.step() #Actor Update self.actor_optim.zero_grad() policy_loss = -self.critic.forward( (state_batch), self.actor.forward((state_batch))) policy_loss = policy_loss.mean() policy_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), 10) self.actor_optim.step() soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) #Nets back to CPU if using memory_cuda if self.args.is_memory_cuda and not self.args.is_cuda: self.actor.cpu() self.actor_target.cpu() self.critic_target.cpu() self.critic.cpu()
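# Illustrative sketch: update_parameters(batch) above consumes a batch object exposing
# .state, .action, .next_state, .reward and .done fields. A namedtuple replay memory in
# the style of the classic PyTorch DQN tutorial produces exactly that shape; the class
# below is an assumption, not the project's actual buffer.
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemorySketch:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))
        if len(self.memory) > self.capacity:
            self.memory.pop(0)   # drop the oldest transition

    def sample(self, batch_size):
        transitions = random.sample(self.memory, batch_size)
        # Transpose a list of Transitions into a Transition of tuples so that
        # batch.state, batch.action, ... can each be torch.cat'ed.
        return Transition(*zip(*transitions))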
def __init__(self, args): self.args = args self.evolver = SSNE(self.args) #MP TOOLS self.manager = Manager() #Genealogy tool self.genealogy = Genealogy() #Initialize population self.pop = self.manager.list() for _ in range(args.pop_size): wwid = self.genealogy.new_id('evo') if ALGO == 'SAC': self.pop.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid)) else: self.pop.append(Actor(args.state_dim, args.action_dim, wwid)) if ALGO == "SAC": self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1) else: self.best_policy = Actor(args.state_dim, args.action_dim, -1) #Turn off gradients and put in eval mod for actor in self.pop: actor = actor.cpu() actor.eval() #Init BUFFER self.replay_buffer = Buffer(1000000, self.args.buffer_gpu) #Intialize portfolio of learners self.portfolio = [] self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID) self.rollout_bucket = self.manager.list() for _ in range(len(self.portfolio)): if ALGO == 'SAC': self.rollout_bucket.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) else: self.rollout_bucket.append( Actor(args.state_dim, args.action_dim, -1)) # Initialize shared data bucket self.data_bucket = self.replay_buffer.tuples ############## MULTIPROCESSING TOOLS ################### #Evolutionary population Rollout workers self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_workers = [ Process(target=rollout_worker, args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO)) for id in range(args.pop_size) ] for worker in self.evo_workers: worker.start() self.evo_flag = [True for _ in range(args.pop_size)] #Learner rollout workers self.task_pipes = [Pipe() for _ in range(args.rollout_size)] self.result_pipes = [Pipe() for _ in range(args.rollout_size)] self.workers = [ Process(target=rollout_worker, args=(id, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO)) for id in range(args.rollout_size) ] for worker in self.workers: worker.start() self.roll_flag = [True for _ in range(args.rollout_size)] #Test bucket self.test_bucket = self.manager.list() if ALGO == 'SAC': self.test_bucket.append( GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) else: self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1)) #5 Test workers self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)] self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)] self.test_workers = [ Process(target=rollout_worker, args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO)) for id in range(TEST_SIZE) ] for worker in self.test_workers: worker.start() self.test_flag = False #Meta-learning controller (Resource Distribution) self.allocation = [ ] #Allocation controls the resource allocation across learners for i in range(args.rollout_size): self.allocation.append( i % len(self.portfolio)) #Start uniformly (equal resources) #self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistsitic (each node is a learner), to compute UCB scores #Trackers self.best_score = 0.0 self.gen_frames = 0 self.total_frames = 0 self.best_shaped_score = None self.test_score = None self.test_std = 
None
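# Illustrative sketch: the Pipe/Process plumbing above assumes a rollout_worker function
# with a simple request/response protocol -- the main process sends (policy_id, gen) and
# the worker replies with (policy_id, fitness, frames). A minimal version might look like
# this; the env and actor interfaces are assumptions.
def rollout_worker_sketch(worker_id, task_pipe, result_pipe, models, env):
    while True:
        policy_id, _gen = task_pipe.recv()
        if policy_id == 'TERMINATE':
            break
        policy = models[policy_id]
        state, fitness, frames, done = env.reset(), 0.0, 0, False
        while not done:
            action = policy.clean_action(state)        # assumed actor interface
            state, reward, done, _info = env.step(action)
            fitness += reward
            frames += 1
        result_pipe.send((policy_id, fitness, frames))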
class Evaluator(object): def __init__( self, CERL_agent, num_workers, trainers, pomdp_adv=False ): #trainers first is the blue agent and second is the red model self.num_workers = num_workers self.trainers = trainers self.pomdp_adv = pomdp_adv self.args = CERL_agent.args self.drqn = CERL_agent.args.drqn #denote if blue uses drqn if self.pomdp_adv: self.trainers = [trainers[0], None] #make sure the red model is never used self.buffer_gpu = CERL_agent.args.buffer_gpu self.batch_size = CERL_agent.args.batch_size self.algo = CERL_agent.args.algo self.state_dim = CERL_agent.args.state_dim self.action_dim = CERL_agent.args.action_dim self.buffer = Buffer(BUFFER_SIZE, self.buffer_gpu) #initialize own replay buffer self.data_bucket = self.buffer.tuples self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)] self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)] self.actual_red_worker = Actor( CERL_agent.args.state_dim, CERL_agent.args.action_dim, -1, 'dis') #this model is shared accross the workers self.actual_red_worker.share_memory() self.td3args = { 'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': CERL_agent.args.action_low, 'action_high': CERL_agent.args.action_high, 'cerl_args': self.args } self.renew_learner( ) #now we are not using new learner for each iteration self.rollout_bucket = [ self.actual_red_worker for i in range(num_workers) ] self.workers = [ Process(target=rollout_worker, args=(id, 3, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.rollout_bucket, 'dummy_name', None, 'dis', self.trainers, False, self.pomdp_adv)) for id in range(num_workers) ] for worker in self.workers: worker.start() self.evo_flag = [True for _ in range(self.num_workers)] #def initialize(self, actor_in): #use the given actor parameter to initialize the red actor # utils.hard_update(self.actual_red_actor, actor_in) def renew_learner( self ): #create a new learning agent, with randomized initial parameter self.learner = Learner(-1, self.algo, self.state_dim, self.action_dim, actor_lr=5e-5, critic_lr=1e-3, gamma=0.99, tau=5e-3, init_w=True, **self.td3args) self.actual_red_actor = self.learner.algo.actor def collect_trajectory(self): utils.hard_update(self.actual_red_worker, self.actual_red_actor) #first snyc the actor #launch rollout_workers for id, actor in enumerate(self.rollout_bucket): if self.evo_flag[id]: self.evo_task_pipes[id][0].send( (id, 0)) #second argument in send is dummy self.evo_flag[id] = False #wait for the rollout to complete and record fitness all_fitness = [] for i in range(self.num_workers): entry = self.evo_result_pipes[i][1].recv() all_fitness.append(entry[1]) self.evo_flag[i] = True self.buffer.referesh() #update replay buffer return all_fitness def train_red( self, training_iterations ): #alternate between collect_trajectory and parameter update while self.buffer.__len__() < self.batch_size * 10: ###BURN IN PERIOD self.collect_trajectory() for i in range(training_iterations): self.collect_trajectory() self.buffer.tensorify() # Tensorify the buffer for fast sampling self.learner.update_parameters(self.buffer, self.buffer_gpu, self.batch_size, 2) #2 update steps def evaluate( self ): #evaluate the quality of blue agent policy, by training a red against it, after evaluation, erase the reply buffer and renew learner self.train_red(TRAIN_ITERATION) self.clear_buffer() #self.renew_learner() return self.evaluate_fixed_agents( self.trainers[0], self.trainers[1], [self.actual_red_actor ]) #calculate the 
mean and std of the evaluation metric def evaluate_fixed_agents( self, blue_dqn, red_model, red_actor_list, num_iterations=25 ): #evaluate the performance given agents, use random neutral and red agent if self.algo == 'dis': # make env with blue and red policy agent inside, dis_env = make_self_play_env( seed=0, return_policy_agent=False, trainers=[blue_dqn, red_model] )[0] # trainer if not None, first is the shared DQN agent, second is the best red policy env = EnvironmentWrapper( '', self.algo, dis_env, 0) # the "0" is the index for training blue agent elif self.algo == 'TD3_tennis': tennis_env = make_tennis_env.TennisEnvFactory( seed=np.random.choice(np.array(range(len(self.pop)))), no_graphics=True, pid=-1).getEnv()[0] env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0) else: raise Exception("only work for 'dis' envir?") average_reward = 0 eps = 0 average_red_reward = 0 red_count = 0 average_actual_blue_reward = 0 blue_count = 0 belief_and_true_type_list = [] assert len(red_actor_list ) is not None, "make sure to input a list of possible red" for it in range(num_iterations): belief_and_true_type = [] if not self.pomdp_adv: # if pomdp_adv, make sure that TD3_actor is never used red_actor = random.choice(red_actor_list) env.set_TD3_actor(red_actor) fitness = 0.0 # here fitness if simplely reward state = env.reset() belief_and_true_type.append(env.belief_and_true_type()) env.randomize_neu_adv() if self.pomdp_adv: env.try_set_pomdp_adv( ) # try to set if opponent to pomdp adv if opponent is adversary, else do nothing render_flag = (np.random.random() < 0.05) while True: # unless done action = blue_dqn.act(state, eps=eps) next_state, reward, done, info = env.step( copy.deepcopy(action), use_actual_reward=self.drqn) belief_and_true_type.append(env.belief_and_true_type()) if render_flag and self.args.render: env.render() state = next_state fitness += reward if done: average_red_reward += env.get_red_reward( ) if env.get_red_reward() is not None else 0 average_actual_blue_reward += env.get_blue_actual_reward( ) if env.get_blue_actual_reward() is not None else 0 red_count += 1 if env.get_red_reward() is not None else 0 blue_count += 1 if env.get_blue_actual_reward( ) is not None else 0 if render_flag: env.env.close() break belief_and_true_type_list.append(belief_and_true_type) average_reward += fitness average_reward /= num_iterations if red_count != 0: average_red_reward /= red_count if blue_count != 0: average_actual_blue_reward /= blue_count return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list def clear_buffer(self): self.buffer.clear_buffer_data() #reinitialize replay buffer def kill_processes(self): for id, actor in enumerate(self.rollout_bucket): self.evo_task_pipes[id][0].send( ('TERMINATE', 0)) #second argument in send is dummy def __del__(self): self.kill_processes()
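# Illustrative sketch: one plausible way to drive the Evaluator above, assuming an already
# constructed CERL_Agent instance named `agent`; the exact construction of `agent` is not
# shown here.
evaluator = Evaluator(agent, num_workers=5, trainers=agent.trainers, pomdp_adv=False)
blue, red, actual_blue, beliefs = evaluator.evaluate_fixed_agents(
    agent.trainers[0],        # blue DQN under evaluation
    agent.trainers[1],        # red model used for belief updates
    list(agent.pop))          # pool of candidate red actors
print('blue:', blue, 'red:', red, 'actual blue:', actual_blue)
evaluator.kill_processes()    # shut the rollout workers down explicitly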