import numpy as np

# ReplayBuffer, Net, and linear_interp are defined elsewhere in the project.


class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        # Second replay buffer (not used in this listing).
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    # Random action during practice.
    def act_pre(self):
        return np.random.randint(self.n_action)

    # Epsilon-greedy action selection.
    def act(self, s):
        if np.random.random() > self.epsilon:
            return self.greedy_act(s)
        return np.random.randint(self.n_action)

    def greedy_act(self, s):
        return self.net.action(s)

    # Practice without recording experiences.
    def practice(self):
        self.lr = 1e-3  # learning rate for pre-training
        self.net.pre_train(self.buffer, self.lr)

    # Records experiences and calls the training functions.
    # `pre` differentiates practice from RL training.
    def record(self, s, a, r, d, it, pre):
        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3  # learning rate for pre-training
                    self.net.pre_train(self.buffer, self.lr)
        else:
            self.buffer.append(s, a, r, d)
            # Anneal epsilon from 1.0 to 0.1 over the first 5e5 steps,
            # then from 0.1 down to a floor of 0.01 by step 1e7.
            if it <= 5e5:
                self.epsilon = linear_interp(0, 5e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(5e5, 10e6, it, 0.01, 0.1), 0.01)
            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4  # learning rate for RL training
                    self.net.train(self.buffer, self.lr)
                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
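# The class above calls linear_interp, which is not shown in this listing.
# Below is a minimal sketch consistent with the call sites (epsilon decays
# from 1.0 at step 0 to 0.1 at step 5e5, then from 0.1 to 0.01 by step 1e7).
# The argument order is an assumption inferred from that usage, not the
# project's actual definition.
def linear_interp(x0, x1, x, y_lo, y_hi):
    # Anneal linearly from y_hi at x=x0 down to y_lo at x=x1.
    frac = (x - x0) / (x1 - x0)
    return y_hi + frac * (y_lo - y_hi)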
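# Hypothetical driver loop showing how the Agent methods fit together. The
# environment name, the config values, and the practice/training split are
# illustrative assumptions; only act_pre/act/record come from the class
# above. Assumes the classic gym API where step() returns
# (obs, reward, done, info).
import gym

env = gym.make('PongNoFrameskip-v4')
config = {'max_size': 100000, 'frame_stack': 4,
          'pre_training_start': 1000, 'pre_train_freq': 4,
          'training_start': 10000, 'train_freq': 4,
          'update_target_freq': 10000}
agent = Agent(env, config, wt=None)

s = env.reset()
for it in range(1, 1000001):
    pre = it <= config['training_start']           # practice phase first
    a = agent.act_pre() if pre else agent.act(s)   # random, then eps-greedy
    s2, r, d, _ = env.step(a)
    agent.record(s, a, r, d, it, pre)              # store and maybe train
    s = env.reset() if d else s2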