class SAC:
    """
    A class used to represent a SAC agent

    Attributes
    ----------
    device : cuda or cpu
        the device on which all the computation occurs
    gamma : float[0,1]
        discount factor
    state_dim : int
        dimension of the environment observation space
    action_dim : int
        dimension of the environment action space
    hidden_dim : int
        dimension of the hidden layers of the networks
    tau : float[0,1]
        coefficient of soft update of target networks
    lr : float
        learning rate of the optimizers
    target_update_interval : int
        number of updates in between soft updates of target networks
    q_net_1 : QNetwork
        soft Q value network 1
    q_net_2 : QNetwork
        soft Q value network 2
    target_q_net_1 : QNetwork
        target Q value network 1
    target_q_net_2 : QNetwork
        target Q value network 2
    policy_net : PolicyNetwork
        policy network
    q1_criterion : torch
        optimization criterion for q_net_1
    q2_criterion : torch
        optimization criterion for q_net_2
    q1_optim : torch
        optimizer for q_net_1
    q2_optim : torch
        optimizer for q_net_2
    policy_optim : torch
        optimizer for policy_net
    alpha : torch float scalar
        entropy temperature (controls policy stochasticity)
    entropy_target : torch float scalar
        entropy target for the environment (see Haarnoja et al. Section 5)

    Methods
    -------
    update(replay_buffer, batch_size, updates) : q1_loss, q2_loss, policy_loss, alpha_loss
        Performs a gradient step of the algorithm, optimizing the Q networks,
        the policy network and alpha
    choose_action(state) : action
        Returns the appropriate action in the given state according to the current policy
    save_networks_parameters(params_dir)
        Saves the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) from the networks
    load_networks_parameters(params_dir)
        Loads the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) into the networks
    """

    def __init__(self, observation_space, action_space, args):
        """
        Constructor

        :param observation_space: observation space of the environment
        :param action_space: action space of the environment
        :param args: command line args to set hyperparameters
        """
        # set hyperparameters
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = args.gamma
        self.state_dim = observation_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.hidden_dim = args.hidden_units
        self.tau = args.tau
        self.lr = args.lr
        self.target_update_interval = args.target_update_interval

        # build and initialize networks
        self.q_net_1 = QNetwork(self.state_dim, self.action_dim,
                                self.hidden_dim).to(self.device)
        self.q_net_2 = QNetwork(self.state_dim, self.action_dim,
                                self.hidden_dim).to(self.device)
        self.target_q_net_1 = QNetwork(self.state_dim, self.action_dim,
                                       self.hidden_dim).to(self.device)
        self.target_q_net_2 = QNetwork(self.state_dim, self.action_dim,
                                       self.hidden_dim).to(self.device)
        hard_update(self.q_net_1, self.target_q_net_1)
        hard_update(self.q_net_2, self.target_q_net_2)
        self.policy_net = PolicyNetwork(self.state_dim, self.action_dim,
                                        self.hidden_dim,
                                        self.device).to(self.device)

        # build criterions and optimizers
        self.q1_criterion = nn.MSELoss()
        self.q2_criterion = nn.MSELoss()
        self.q1_optim = optim.Adam(self.q_net_1.parameters(), lr=self.lr)
        self.q2_optim = optim.Adam(self.q_net_2.parameters(), lr=self.lr)
        self.policy_optim = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # for optimizing alpha (see Haarnoja et al. section 5)
        if args.initial_alpha is not None:
            self.alpha = torch.tensor(args.initial_alpha, requires_grad=True,
                                      device=self.device, dtype=torch.float)
        else:
            self.alpha = torch.rand(1, requires_grad=True,
                                    device=self.device, dtype=torch.float)
        if args.entropy_target is not None:
            self.entropy_target = torch.tensor(args.entropy_target,
                                               device=self.device,
                                               dtype=torch.float)
        else:
            # default heuristic: -dim(action space)
            self.entropy_target = -1. * torch.tensor(
                action_space.shape, device=self.device, dtype=torch.float)
        self.alpha_optim = optim.Adam([self.alpha], lr=self.lr)

    def update(self, replay_buffer, batch_size, updates):
        """
        Performs a gradient step of the algorithm, optimizing the Q networks,
        the policy network and alpha

        :param replay_buffer: replay buffer to sample batches of transitions from
        :param batch_size: size of the batches
        :param updates: number of updates so far
        :return: losses of the four optimizers (q1_optim, q2_optim, policy_optim, alpha_optim)
        :rtype: tuple of torch scalar floats
        """
        # sample a transition batch from the replay buffer and cast it to tensors of the correct shape
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = replay_buffer.sample(
            batch_size)
        state_batch = torch.from_numpy(state_batch).to(self.device, dtype=torch.float)
        next_state_batch = torch.from_numpy(next_state_batch).to(
            self.device, dtype=torch.float)
        action_batch = torch.from_numpy(action_batch).to(self.device, dtype=torch.float)
        reward_batch = torch.from_numpy(reward_batch).unsqueeze(1).to(
            self.device, dtype=torch.float)
        done_batch = torch.from_numpy(np.float32(done_batch)).unsqueeze(1).to(
            self.device, dtype=torch.float)

        # sample actions from the policy to be used for the expectation updates
        sampled_action, log_prob, epsilon, mean, log_std = self.policy_net.sample(
            state_batch)

        ### evaluation step
        target_next_value = torch.min(
            self.target_q_net_1(next_state_batch, sampled_action),
            self.target_q_net_2(next_state_batch, sampled_action)) - self.alpha * log_prob
        current_q_value_1 = self.q_net_1(state_batch, action_batch)
        current_q_value_2 = self.q_net_2(state_batch, action_batch)
        expected_next_value = reward_batch + (
            1 - done_batch) * self.gamma * target_next_value
        q1_loss = self.q1_criterion(current_q_value_1, expected_next_value.detach())
        q2_loss = self.q2_criterion(current_q_value_2, expected_next_value.detach())

        # optimize q1 and q2 nets
        self.q1_optim.zero_grad()
        q1_loss.backward()
        self.q1_optim.step()
        self.q2_optim.zero_grad()
        q2_loss.backward()
        self.q2_optim.step()

        ### improvement step
        sampled_q_value = torch.min(self.q_net_1(state_batch, sampled_action),
                                    self.q_net_2(state_batch, sampled_action))
        policy_loss = (self.alpha * log_prob - sampled_q_value).mean()

        # optimize policy net
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # optimize alpha
        alpha_loss = (self.alpha *
                      (-log_prob - self.entropy_target).detach()).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        # update Q target values
        if updates % self.target_update_interval == 0:
            soft_update(self.q_net_1, self.target_q_net_1, self.tau)
            soft_update(self.q_net_2, self.target_q_net_2, self.tau)

        return q1_loss.item(), q2_loss.item(), policy_loss.item(), alpha_loss.item()

    def choose_action(self, state):
        """
        Returns the appropriate action in the given state according to the current policy

        :param state: state
        :return: action
        :rtype: numpy float array
        """
        action = self.policy_net.get_action(state)
        # move to cpu, remove from gradient graph, cast to numpy
        return action.cpu().detach().numpy()

    def save_networks_parameters(self, params_dir=None):
        """
        Saves the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) from the networks

        :param params_dir: directory to save parameters to (optional)
        :return: the directory the parameters were saved under
        """
        if params_dir is None:
            params_dir = "SavedAgents/"
        # create a subfolder with the current timestamp
        prefix = os.path.join(
            params_dir, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        if not os.path.exists(prefix):
            os.makedirs(prefix)

        policy_path = os.path.join(prefix, "policy_net_params")
        q1_path = os.path.join(prefix, "q1_net_params")
        q2_path = os.path.join(prefix, "q2_net_params")
        alpha_path = os.path.join(prefix, "alpha_param")

        print("Saving parameters to {}, {}, {}".format(q1_path, q2_path, policy_path))
        torch.save(self.q_net_1.state_dict(), q1_path)
        torch.save(self.q_net_2.state_dict(), q2_path)
        torch.save(self.policy_net.state_dict(), policy_path)
        torch.save(self.alpha, alpha_path)

        return params_dir

    def load_networks_parameters(self, params_dir):
        """
        Loads the relevant parameters (q1_net's, q2_net's, policy_net's, alpha) into the networks

        :param params_dir: directory to load parameters from
        :return: None
        """
        if params_dir is not None:
            print("Loading parameters from {}".format(params_dir))

            policy_path = os.path.join(params_dir, "policy_net_params")
            self.policy_net.load_state_dict(torch.load(policy_path))

            q1_path = os.path.join(params_dir, "q1_net_params")
            q2_path = os.path.join(params_dir, "q2_net_params")
            self.q_net_1.load_state_dict(torch.load(q1_path))
            self.q_net_2.load_state_dict(torch.load(q2_path))

            alpha_path = os.path.join(params_dir, "alpha_param")
            self.alpha = torch.load(alpha_path)
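# The SAC class above calls hard_update() and soft_update(), which are not part of
# this excerpt. Below is a minimal sketch of what they are assumed to do (copy
# weights, and Polyak-average weights with coefficient tau). The argument order
# (source network first, target network second) is inferred from the call sites
# and may differ from the original helpers.

def hard_update(source, target):
    # copy every parameter of `source` into `target`
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(source, target, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)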
class DQN_Agent():
    # In this class, we will implement functions to do the following.
    # (1) Create an instance of the Q Network class.
    # (2) Create a function that constructs a policy from the Q values predicted by the Q Network.
    #     (a) Epsilon Greedy Policy.
    #     (b) Greedy Policy.
    # (3) Create a function to train the Q Network, by interacting with the environment.
    # (4) Create a function to test the Q Network's performance on the environment.
    # (5) Create a function for Experience Replay.

    def __init__(self, game, agentsTypes, agent_index, parameters=None,
                 render=False, use_replay=False, deep=0, monitor=False):
        # Create an instance of the network itself, as well as the memory.
        # Here is also a good place to set environmental parameters,
        # as well as training parameters - number of episodes / iterations, etc.
        self.gamma = 0.99
        self.RLalpha = 0.1
        self.SLalpha = 0.005
        self.RLBufferSize = 1000
        self.SLBufferSize = 50000
        self.epsilon_initial = 0.5
        self.epsilon = self.epsilon_initial
        self.episodes = 1000000
        self.env = game.env
        self.agentsTypes = agentsTypes
        self.agent_index = agent_index
        self.state_size = self.env.state_size
        self.action_size = self.env.action_size
        self.eta = 0.1
        self.deep = deep
        self.policynet = PNetwork(self.env, self, deep=deep)
        self.valuenet = QNetwork(self.env, self, deep=deep)
        self.target_update_period = 100
        self.network_update_period = 128
        self.network_updates = 2
        self.iteration = 0
        self.brp = True
        self.sigma = self.brp_action
        self.replayRL = IS_Replay_Memory(game, agentsTypes, self.agent_index,
                                         memory_size=self.RLBufferSize)
        # self.replayRL = Prioritized_Replay_Memory(game, memory_size=self.RLBufferSize)
        self.replaySL = Replay_Memory(game, memory_size=self.SLBufferSize,
                                      kind=replay.RESERVOIR)

    # q_values: State * Action -> Value
    def brp_action(self, state):
        # epsilon-greedy best response over the Q network
        if random.random() < self.epsilon:
            action = random.randint(0, self.action_size - 1)
            return action
        else:
            best_action, _ = self.valuenet.best_action(state)
            return best_action

    # greedy policy
    def average_policy_action(self, state):
        best_action = self.policynet.best_action(state)
        return best_action

    def resetepisode(self, average_only=False):
        # with probability eta play the best response, otherwise the average policy
        if not average_only and random.random() < self.eta:
            self.brp = True
            self.sigma = self.brp_action
        else:
            self.brp = False
            self.sigma = self.average_policy_action

    def act(self, state):
        action = self.sigma(state)
        return action

    def updatereplay(self, state, action, reward, next_state, done, actionset, stateset):
        # See paper for recommended epsilon decay
        self.epsilon = self.epsilon_initial / math.ceil(math.sqrt((self.iteration + 1) / 10000))
        self.iteration += 1

        if self.brp:
            action_onehot = [0 for _ in range(self.action_size)]
            action_onehot[action] = 1
            self.replaySL.append([state, action_onehot])
        else:
            # If we are using Importance Sampling Experience Replay,
            # the last two fields are the likelihood at the time the item was added
            # and the current opponent likelihood.
            # These fields should be ignored for normal experience replay.
            self.replayRL.append([state, action, reward, next_state, done, actionset, stateset])

        if self.iteration % self.network_update_period == 0:
            for i in range(self.network_updates):
                batch = self.network_update_period
                # replayRLbatch = self.replayRL.sample_batch(self, batch_size=batch)
                replayRLbatch, batch_importance_weights = self.replayRL.sample_batch(batch)
                # should use MSE loss, with action_size outputs
                self.valuenet.update_batch(batch, replayRLbatch, batch_importance_weights)
                if self.replaySL.size >= 1000:
                    replaySLbatch = self.replaySL.sample_batch(batch)
                    # should use cross-entropy loss with a softmax activation
                    self.policynet.update_batch(batch, replaySLbatch)

        if self.iteration % self.target_update_period == 0:
            self.valuenet.update_target()

    def appendreplay(self, state, action, reward, next_state, done, actionset, stateset):
        self.replayRL.append([state, action, reward, next_state, done, actionset, stateset])

    # Not essential
    def surveySLMemory(self):
        freqs = [0 for i in range(self.action_size)]
        for item in self.replaySL.cache[:self.replaySL.size]:
            for j in range(self.action_size):
                if item[1][j] == 1:
                    freqs[j] += 1
        print(freqs)

    # Not essential
    def surveyRLMemory(self):
        for item in self.replayRL.cache[:self.replayRL.size]:
            print(item)
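# A minimal sketch of how this NFSP-style agent is assumed to be driven, based only
# on the method names above (resetepisode / act / updatereplay). The environment API
# used here (a step() that also returns `actionset` and `stateset`) is hypothetical
# and only mirrors the arguments updatereplay() expects.

def run_episode(agent, env):
    agent.resetepisode()   # pick best-response or average policy for this episode
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, actionset, stateset = env.step(action)
        agent.updatereplay(state, action, reward, next_state, done, actionset, stateset)
        state = next_state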
class DQN_Agent():

    def __init__(self, env, render=False, model_type=None, save_folder=None):
        self.net = QNetwork(env, model_type=model_type)
        self.obs_space = env.observation_space.shape[0]
        self.ac_space = env.action_space.n
        self.render = render
        ###################### Hyperparameters ##########################
        self.env = env
        self.epsilon = 0.7
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.999
        self.gamma = 0.99
        self.max_itr = 1000000
        self.batch_size = 32
        self.max_reward = 160  # Used for saving a model with a reward above a certain threshold
        self.memory_queue = Replay_Memory(memory_size=50000, burn_in=30000)
        ##################################################################
        self.avg_rew_buffer = 10
        self.avg_rew_queue = deque(maxlen=self.avg_rew_buffer)
        self.model_save = 50
        self.test_model_interval = 50
        self.save_folder = save_folder

    def epsilon_greedy_policy(self, q_values, epsi):
        # Creating epsilon-greedy probabilities to sample from.
        # Q-values shape is batch_size x action_space.
        if random.uniform(0, 1) <= epsi:
            return random.randint(0, self.ac_space - 1)
        else:
            return np.argmax(q_values[0])

    def greedy_policy(self, q_values):
        # Creating greedy policy for test time.
        return np.argmax(q_values[0])

    def train(self):
        testX, testY = [], []
        batch_size, max_, avg_rew_test, itr = self.batch_size, 0, 0, 0
        print("Using Experience Replay")

        # Burn in the replay memory with random transitions
        self.burn_in_memory()

        if self.save_folder is not None:
            self.env = Monitor(self.env, self.save_folder,
                               video_callable=lambda episode_id: episode_id % 500 == 0,
                               force=True)

        for epi in range(EPISODES):
            state = np.reshape(self.env.reset(), [1, self.obs_space])  # Reset the state
            total_rew = 0
            while True:
                itr += 1
                if self.render:
                    self.env.render()

                # Get action by epsilon-greedy
                ac = self.epsilon_greedy_policy(self.net.model.predict(state), self.epsilon)
                # Find out the next state and reward for the current action
                n_s, rew, is_t, _ = self.env.step(ac)
                # Append to queue
                n_s = np.reshape(n_s, [1, self.obs_space])
                self.memory_queue.append([state, ac, rew, is_t, n_s])

                # Get samples of size batch_size
                batch = self.memory_queue.sample_batch(batch_size=batch_size)

                # Create arrays of states and next states
                batch_states = np.zeros((len(batch), self.obs_space))
                batch_next_states = np.zeros((len(batch), self.obs_space))
                actions, rewards, terminals = [], [], []
                for i in range(0, len(batch)):
                    b_state, b_ac, b_rew, b_is_t, b_ns = batch[i]
                    # Returns already reshaped b_state and b_ns
                    batch_states[i] = b_state
                    batch_next_states[i] = b_ns
                    actions.append(b_ac)
                    rewards.append(b_rew)
                    terminals.append(b_is_t)

                # Get predictions
                batch_q_values = self.net.model.predict(batch_states)
                batch_next_q_values = self.net.model.predict(batch_next_states)
                for i in range(0, len(batch)):
                    if terminals[i]:
                        # Corresponds to is_terminal in the sampled batch
                        batch_q_values[i][actions[i]] = rewards[i]
                    else:
                        batch_q_values[i][actions[i]] = rewards[i] + self.gamma * (
                            np.amax(batch_next_q_values[i]))

                # Perform one step of SGD
                self.net.model.fit(batch_states, batch_q_values,
                                   batch_size=batch_size, epochs=1, verbose=0)

                self.epsilon *= self.epsilon_decay
                self.epsilon = max(self.epsilon, self.epsilon_min)
                total_rew += rew
                state = n_s
                if is_t:
                    break

            # Test the model at intervals
            if (epi + 1) % self.test_model_interval == 0:
                testX.append(epi)
                avg_rew_test = self.test()
                testY.append(avg_rew_test)

            # Remove and add rewards to calculate the average reward
            if len(self.avg_rew_queue) > self.avg_rew_buffer:
                self.avg_rew_queue.popleft()
            self.avg_rew_queue.append(total_rew)
            avg_rew = sum(self.avg_rew_queue) / len(self.avg_rew_queue)

            ###################### SAVING SECTION ###########################
            # Save at intervals
            # if (epi + 1) % self.model_save == 0:
            #     self.net.model.save('CartPole_linearwExpReplay_{}.h5'.format(epi))
            if max_ < avg_rew_test and avg_rew_test > self.max_reward:
                # self.net.model.save('CartPole_linear_comp_8.h5')
                max_ = avg_rew_test
            ##################################################################

            print(epi, itr, avg_rew, total_rew)

        plot_eval(testX, testY)  # Plotting after episodes are done

    def test(self, model_file=None):
        test_episodes = 20
        rewards = []
        if model_file is not None:
            self.net.load_model(model_file)
        for e in range(test_episodes):
            state = np.reshape(self.env.reset(), [1, self.obs_space])
            time_steps = 0
            total_reward_per_episode = 0
            while True:
                if self.render:
                    self.env.render()
                action = self.epsilon_greedy_policy(self.net.model.predict(state), 0.05)
                next_state, reward, is_t, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.obs_space])
                state = next_state
                total_reward_per_episode += reward
                time_steps += 1
                if is_t:
                    break
            rewards.append(total_reward_per_episode)
            print("Total Reward for Episode {} is {}".format(e, total_reward_per_episode))
        avg_rewards_ = np.mean(np.array(rewards))
        std_dev = np.std(rewards)
        print("AvgRew={},Std={}".format(avg_rewards_, std_dev))
        return avg_rewards_

    def burn_in_memory(self):
        # Initialize the replay memory with a burn_in number of episodes / transitions.
        memory_size = 0
        state = np.reshape(self.env.reset(), [1, self.obs_space])
        while memory_size < self.memory_queue.burn_in:
            ac = random.randint(0, self.ac_space - 1)
            n_s, rew, is_t, _ = self.env.step(ac)
            n_s = np.reshape(n_s, [1, self.obs_space])
            transition = [state, ac, rew, is_t, n_s]
            self.memory_queue.append(transition)
            state = n_s
            if is_t:
                state = np.reshape(self.env.reset(), [1, self.obs_space])
            memory_size += 1
        print("Burned Memory Queue")
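# A minimal sketch of how this agent is assumed to be wired up. EPISODES is a
# module-level constant referenced (but not defined) in train() above, so an assumed
# value is set here; QNetwork, Replay_Memory and plot_eval come from elsewhere in the
# project, and the environment id is only suggested by the CartPole file names above.

if __name__ == "__main__":
    import gym

    EPISODES = 5000                       # assumed value; not given in this excerpt
    env = gym.make("CartPole-v0")
    agent = DQN_Agent(env, render=False, model_type=None, save_folder=None)
    agent.train()                         # trains with experience replay
    agent.test()                          # reports average reward over 20 episodes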
class MPOAgent:

    def __init__(self, env_id: str, logdir: Path):
        self.env_id = env_id
        self.summary_writer = tf.summary.create_file_writer(
            str(logdir)) if logdir else None

        self.action_space = gym.make(self.env_id).action_space.shape[0]
        self.replay_buffer = ReplayBuffer(maxlen=10000)

        self.policy = GaussianPolicyNetwork(action_space=self.action_space)
        self.target_policy = GaussianPolicyNetwork(
            action_space=self.action_space)
        self.critic = QNetwork()
        self.target_critic = QNetwork()

        self.log_temperature = tf.Variable(1.)
        self.log_alpha_mu = tf.Variable(1.)
        self.log_alpha_sigma = tf.Variable(1.)

        self.eps = 0.1
        self.eps_mu = 0.01
        self.eps_sigma = 0.001

        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
        self.temperature_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

        self.batch_size = 128
        self.n_samples = 10
        self.update_period = 4
        self.gamma = 0.99
        self.target_policy_update_period = 400
        self.target_critic_update_period = 400

        self.global_steps = 0
        self.episode_count = 0

        self.setup()

    def setup(self):
        """ Initialize network weights """
        env = gym.make(self.env_id)
        dummy_state = env.reset()
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)
        dummy_action = np.random.normal(0, 0.1, size=self.action_space)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.policy(dummy_state)
        self.target_policy(dummy_state)
        self.critic(dummy_state, dummy_action)
        self.target_critic(dummy_state, dummy_action)

        self.target_policy.set_weights(self.policy.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    def save(self, save_dir):
        save_dir = Path(save_dir)
        self.policy.save_weights(str(save_dir / "policy"))
        self.critic.save_weights(str(save_dir / "critic"))

    def load(self, load_dir=None):
        load_dir = Path(load_dir)
        self.policy.load_weights(str(load_dir / "policy"))
        self.target_policy.load_weights(str(load_dir / "policy"))
        self.critic.load_weights(str(load_dir / "critic"))
        self.target_critic.load_weights(str(load_dir / "critic"))

    def rollout(self):
        episode_rewards, episode_steps = 0, 0
        done = False
        env = gym.make(self.env_id)
        state = env.reset()

        while not done:
            action = self.policy.sample_action(np.atleast_2d(state))
            action = action.numpy()[0]
            try:
                next_state, reward, done, _ = env.step(action)
            except Exception as err:
                print(err)
                import pdb
                pdb.set_trace()

            #: BipedalWalker's fall penalty of -100 is too large, so clip the reward
            transition = Transition(state, action, np.clip(reward, -1., 1.),
                                    next_state, done)
            self.replay_buffer.add(transition)

            state = next_state
            episode_rewards += reward
            episode_steps += 1
            self.global_steps += 1

            if (len(self.replay_buffer) >= 5000
                    and self.global_steps % self.update_period == 0):
                self.update_networks()

            if self.global_steps % self.target_critic_update_period == 0:
                self.target_critic.set_weights(self.critic.get_weights())

            if self.global_steps % self.target_policy_update_period == 0:
                self.target_policy.set_weights(self.policy.get_weights())

        self.episode_count += 1

        with self.summary_writer.as_default():
            tf.summary.scalar("episode_reward_stp", episode_rewards,
                              step=self.global_steps)
            tf.summary.scalar("episode_steps_stp", episode_steps,
                              step=self.global_steps)
            tf.summary.scalar("episode_reward", episode_rewards,
                              step=self.episode_count)
            tf.summary.scalar("episode_steps", episode_steps,
                              step=self.episode_count)

        return episode_rewards, episode_steps

    def update_networks(self):

        (states, actions, rewards, next_states,
         dones) = self.replay_buffer.get_minibatch(batch_size=self.batch_size)

        B, M = self.batch_size, self.n_samples

        # [B, obs_dim] -> [B, obs_dim * M] -> [B * M, obs_dim]
        next_states_tiled = tf.reshape(tf.tile(next_states, multiples=(1, M)),
                                       shape=(B * M, -1))

        target_mu, target_sigma = self.target_policy(next_states_tiled)

        # For MultivariateGaussianPolicy
        #target_dist = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=target_sigma)
        # For IndependentGaussianPolicy
        target_dist = tfd.Independent(tfd.Normal(loc=target_mu, scale=target_sigma),
                                      reinterpreted_batch_ndims=1)

        sampled_actions = target_dist.sample()  # [B * M, action_dim]
        #sampled_actions = tf.clip_by_value(sampled_actions, -1.0, 1.0)

        # Update Q-network:
        sampled_qvalues = tf.reshape(self.target_critic(next_states_tiled, sampled_actions),
                                     shape=(B, M, -1))
        mean_qvalues = tf.reduce_mean(sampled_qvalues, axis=1)
        TQ = rewards + self.gamma * (1.0 - dones) * mean_qvalues

        with tf.GradientTape() as tape1:
            Q = self.critic(states, actions)
            loss_critic = tf.reduce_mean(tf.square(TQ - Q))

        variables = self.critic.trainable_variables
        grads = tape1.gradient(loss_critic, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.critic_optimizer.apply_gradients(zip(grads, variables))

        # E-step:
        # Obtain η* by minimising g(η)
        with tf.GradientTape() as tape2:
            temperature = tf.math.softplus(self.log_temperature)
            q_logsumexp = tf.math.reduce_logsumexp(sampled_qvalues / temperature, axis=1)
            loss_temperature = temperature * (
                self.eps + tf.reduce_mean(q_logsumexp, axis=0))

        grad = tape2.gradient(loss_temperature, self.log_temperature)
        if tf.math.is_nan(grad).numpy().sum() != 0:
            print("NAN GRAD in TEMPERATURE !!!!!!!!!")
            import pdb
            pdb.set_trace()
        else:
            self.temperature_optimizer.apply_gradients([
                (grad, self.log_temperature)])

        # Obtain sample-based variational distribution q(a|s)
        temperature = tf.math.softplus(self.log_temperature)

        # M-step: Optimize the lower bound J with respect to θ
        weights = tf.squeeze(tf.math.softmax(sampled_qvalues / temperature, axis=1),
                             axis=2)  # [B, M, 1] -> [B, M]

        if tf.math.is_nan(weights).numpy().sum() != 0:
            print("NAN in weights !!!!!!!!!")
            import pdb
            pdb.set_trace()

        with tf.GradientTape(persistent=True) as tape3:
            online_mu, online_sigma = self.policy(next_states_tiled)

            # For MultivariateGaussianPolicy
            #online_dist = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=online_sigma)
            # For IndependentGaussianPolicy
            online_dist = tfd.Independent(tfd.Normal(loc=online_mu, scale=online_sigma),
                                          reinterpreted_batch_ndims=1)

            log_probs = tf.reshape(online_dist.log_prob(sampled_actions) + 1e-6,
                                   shape=(B, M))  # [B * M, ] -> [B, M]

            cross_entropy_qp = tf.reduce_sum(weights * log_probs, axis=1)  # [B, M] -> [B, ]

            # For MultivariateGaussianPolicy
            # online_dist_fixedmu = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=online_sigma)
            # online_dist_fixedsigma = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=target_sigma)
            # For IndependentGaussianPolicy
            online_dist_fixedmu = tfd.Independent(
                tfd.Normal(loc=target_mu, scale=online_sigma),
                reinterpreted_batch_ndims=1)
            online_dist_fixedsigma = tfd.Independent(
                tfd.Normal(loc=online_mu, scale=target_sigma),
                reinterpreted_batch_ndims=1)

            kl_mu = tf.reshape(
                target_dist.kl_divergence(online_dist_fixedsigma),
                shape=(B, M))  # [B * M, ] -> [B, M]
            kl_sigma = tf.reshape(
                target_dist.kl_divergence(online_dist_fixedmu),
                shape=(B, M))  # [B * M, ] -> [B, M]

            alpha_mu = tf.math.softplus(self.log_alpha_mu)
            alpha_sigma = tf.math.softplus(self.log_alpha_sigma)

            loss_policy = -cross_entropy_qp  # [B, ]
            loss_policy += tf.stop_gradient(alpha_mu) * tf.reduce_mean(kl_mu, axis=1)
            loss_policy += tf.stop_gradient(alpha_sigma) * tf.reduce_mean(kl_sigma, axis=1)
            loss_policy = tf.reduce_mean(loss_policy)  # [B, ] -> [1]

            loss_alpha_mu = tf.reduce_mean(
                alpha_mu * tf.stop_gradient(self.eps_mu - tf.reduce_mean(kl_mu, axis=1)))
            loss_alpha_sigma = tf.reduce_mean(
                alpha_sigma * tf.stop_gradient(self.eps_sigma - tf.reduce_mean(kl_sigma, axis=1)))
            loss_alpha = loss_alpha_mu + loss_alpha_sigma

        variables = self.policy.trainable_variables
        grads = tape3.gradient(loss_policy, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.policy_optimizer.apply_gradients(zip(grads, variables))

        variables = [self.log_alpha_mu, self.log_alpha_sigma]
        grads = tape3.gradient(loss_alpha, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.alpha_optimizer.apply_gradients(zip(grads, variables))

        del tape3

        with self.summary_writer.as_default():
            tf.summary.scalar("loss_policy", loss_policy, step=self.global_steps)
            tf.summary.scalar("loss_critic", loss_critic, step=self.global_steps)
            tf.summary.scalar("sigma", tf.reduce_mean(online_sigma), step=self.global_steps)
            tf.summary.scalar("kl_mu", tf.reduce_mean(kl_mu), step=self.global_steps)
            tf.summary.scalar("kl_sigma", tf.reduce_mean(kl_sigma), step=self.global_steps)
            tf.summary.scalar("temperature", temperature, step=self.global_steps)
            tf.summary.scalar("alpha_mu", alpha_mu, step=self.global_steps)
            tf.summary.scalar("alpha_sigma", alpha_sigma, step=self.global_steps)
            tf.summary.scalar("replay_buffer", len(self.replay_buffer), step=self.global_steps)

    def testplay(self, name, monitor_dir):
        total_rewards = []

        env = wrappers.RecordVideo(gym.make(self.env_id),
                                   video_folder=monitor_dir,
                                   step_trigger=lambda i: True,
                                   name_prefix=name)

        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.policy.sample_action(np.atleast_2d(state))
            action = action.numpy()[0]
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = next_state

        total_rewards.append(total_reward)
        print(f"{name}", total_reward)
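# A minimal sketch of a training driver for MPOAgent. The environment id
# ("BipedalWalker-v3", suggested by the reward-clipping comment in rollout()), the
# log and checkpoint directories, and the episode count are all assumptions, not
# part of this excerpt.

if __name__ == "__main__":
    from pathlib import Path

    agent = MPOAgent(env_id="BipedalWalker-v3", logdir=Path("logs/mpo"))
    for _ in range(1000):                       # number of episodes is arbitrary here
        rewards, steps = agent.rollout()        # collects data and updates networks
        print(rewards, steps)

    Path("checkpoints/mpo").mkdir(parents=True, exist_ok=True)
    agent.save("checkpoints/mpo")               # hypothetical checkpoint directory
    agent.testplay(name="mpo", monitor_dir="videos")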
class Learner:

    def __init__(self, env_name, gamma=0.98):
        self.env_name = env_name
        self.action_space = gym.make(self.env_name).action_space.n
        self.q_network = QNetwork(self.action_space)
        self.target_q_network = QNetwork(self.action_space)
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.setup()

    def setup(self):
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))
        self.target_q_network(np.atleast_2d(state))
        self.target_q_network.set_weights(self.q_network.get_weights())

    def get_weights(self):
        current_weights = self.q_network.get_weights()
        return current_weights

    def update_network(self, minibatchs):

        indices_all = []
        td_errors_all = []
        losses = []

        for (indices, weights, transitions) in minibatchs:

            states, actions, rewards, next_states, dones = zip(*transitions)

            states = np.vstack(states)
            actions = np.array(actions)
            rewards = np.vstack(rewards)
            next_states = np.vstack(next_states)
            dones = np.vstack(dones)

            next_qvalues = self.q_network(next_states)
            next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
            next_actions_onehot = tf.one_hot(next_actions, self.action_space)
            next_maxQ = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                      axis=1, keepdims=True)
            TQ = rewards + self.gamma * (1 - dones) * next_maxQ

            with tf.GradientTape() as tape:
                qvalues = self.q_network(states)
                actions_onehot = tf.one_hot(actions, self.action_space)
                Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
                td_errors = tf.square(TQ - Q)
                loss = tf.reduce_mean(weights * td_errors)

            grads = tape.gradient(loss, self.q_network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 40.0)
            self.optimizer.apply_gradients(
                zip(grads, self.q_network.trainable_variables))

            indices_all += indices
            td_errors_all += td_errors.numpy().flatten().tolist()
            losses.append(loss)

        loss_info = np.array(losses).mean()
        current_weights = self.q_network.get_weights()

        return current_weights, indices_all, td_errors_all, loss_info
class Actor:

    def __init__(self, pid, env_name, epsilon, gamma=0.98):
        self.pid = pid
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.action_space = self.env.action_space.n
        self.q_network = QNetwork(self.action_space)
        self.epsilon = epsilon
        self.gamma = gamma
        self.buffer = []
        self.state = self.env.reset()
        self.setup()
        self.episode_rewards = 0

    def setup(self):
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))

    def rollout(self, current_weights):
        #: Synchronize weights with the global Q network
        self.q_network.set_weights(current_weights)

        #: rollout 100 steps
        for _ in range(100):
            state = self.state
            action = self.q_network.sample_action(state, self.epsilon)
            next_state, reward, done, _ = self.env.step(action)
            self.episode_rewards += reward
            transition = (state, action, reward, next_state, done)
            self.buffer.append(transition)

            if done:
                #print(self.episode_rewards)
                self.state = self.env.reset()
                self.episode_rewards = 0
            else:
                self.state = next_state

        #: Compute initial priorities for the collected transitions
        states = np.vstack([transition[0] for transition in self.buffer])
        actions = np.array([transition[1] for transition in self.buffer])
        rewards = np.vstack([transition[2] for transition in self.buffer])
        next_states = np.vstack([transition[3] for transition in self.buffer])
        dones = np.vstack([transition[4] for transition in self.buffer])

        next_qvalues = self.q_network(next_states)
        next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        next_maxQ = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                  axis=1, keepdims=True)
        TQ = rewards + self.gamma * (1 - dones) * next_maxQ

        qvalues = self.q_network(states)
        actions_onehot = tf.one_hot(actions, self.action_space)
        Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)

        td_errors = (TQ - Q).numpy().flatten()

        transitions = self.buffer
        self.buffer = []

        return td_errors, transitions, self.pid

    def test_play(self, current_weights):
        self.q_network.set_weights(current_weights)

        env = gym.make(self.env_name)
        state = env.reset()
        episode_rewards = 0
        done = False
        while not done:
            action = self.q_network.sample_action(state, self.epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_rewards += reward
            state = next_state

        return episode_rewards
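# A minimal sketch of how Learner and Actor are assumed to fit together in an
# Ape-X style loop: actors collect transitions with per-actor exploration epsilons,
# a prioritized replay buffer stores them with TD-error priorities, and the learner
# updates the network and broadcasts fresh weights. The PrioritizedReplayBuffer API
# used here (push / sample_minibatch / update_priorities) is hypothetical; in the
# original project the Learner and Actors are typically also wrapped as ray remote
# actors, which is omitted here.

def apex_loop(env_name, n_actors=4, iterations=100):
    learner = Learner(env_name)
    epsilons = [0.4 ** (1 + 7 * i / max(n_actors - 1, 1)) for i in range(n_actors)]
    actors = [Actor(pid=i, env_name=env_name, epsilon=epsilons[i])
              for i in range(n_actors)]
    replay = PrioritizedReplayBuffer(capacity=2 ** 16)   # hypothetical buffer class

    weights = learner.get_weights()
    for _ in range(iterations):
        for actor in actors:
            td_errors, transitions, pid = actor.rollout(weights)
            replay.push(td_errors, transitions)          # priorities seeded with TD errors
        minibatchs = [replay.sample_minibatch(batch_size=32) for _ in range(16)]
        weights, indices, td_errors, loss = learner.update_network(minibatchs)
        replay.update_priorities(indices, td_errors)     # refresh priorities after learning
    return learner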
        outputLength, args.hidden_layers_policy, policyActivations,
        nextObsPh, aPh, "Target",
        actionMeanScale=np.expand_dims(clip[1, :], 0),
        logStdInit=None, logStdTrainable=False,
        actionClip=clip)

    Q1 = QNetwork(sess, inputLength, outputLength, args.hidden_layers_q,
                  qActivations, obsPh, aPh, hiddenLayerMergeWithAction,
                  suffix="Orig1")  # original Q network 1
    Q2 = QNetwork(sess, inputLength, outputLength, args.hidden_layers_q,
                  qActivations, obsPh, aPh, hiddenLayerMergeWithAction,
                  suffix="Orig2")  # original Q network 2
    QAux1 = QNetwork(
        sess,