def __init__(self, meta_controller_experience_memory=None, lr=0.00025,
             alpha=0.95, eps=0.01, batch_size=32, gamma=0.99, num_options=12):
    # experience replay memory
    self.meta_controller_experience_memory = meta_controller_experience_memory
    self.lr = lr        # learning rate
    self.alpha = alpha  # optimizer parameter
    self.eps = eps      # optimizer parameter
    self.gamma = gamma  # discount factor

    # BUILD MODEL
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        self.device = torch.device("cuda:1")
    elif torch.cuda.device_count() == 1:
        self.device = torch.device("cuda:0")
    else:
        self.device = torch.device("cpu")

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
    dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
    duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor
    self.dtype = dtype
    self.dlongtype = dlongtype
    self.duinttype = duinttype

    Q = DQN(in_channels=4, num_actions=num_options).type(dtype)
    Q_t = DQN(in_channels=4, num_actions=num_options).type(dtype)
    Q_t.load_state_dict(Q.state_dict())
    Q_t.eval()
    for param in Q_t.parameters():
        param.requires_grad = False

    Q = Q.to(self.device)
    Q_t = Q_t.to(self.device)
    self.batch_size = batch_size
    self.Q = Q
    self.Q_t = Q_t

    # optimizer
    optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
    self.optimizer = optimizer
    print('init: Meta Controller --> OK')
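# Hedged addition (not part of the original class): the constructor freezes Q_t but this
# snippet never refreshes it, so a hard-sync helper like the sketch below is typically
# called periodically from the training loop. The method name and period are assumptions.
def maybe_sync_target(self, step, period=10000):
    if step % period == 0:
        self.Q_t.load_state_dict(self.Q.state_dict())
        self.Q_t.eval()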
class ParallelNashAgent():
    def __init__(self, env, id, args):
        super(ParallelNashAgent, self).__init__()
        self.id = id
        self.current_model = DQN(env, args).to(args.device)
        self.target_model = DQN(env, args).to(args.device)
        update_target(self.current_model, self.target_model)

        if args.load_model and os.path.isfile(args.load_model):
            self.load_model(args.load_model)

        self.epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
        self.replay_buffer = ParallelReplayBuffer(args.buffer_size)
        self.rl_optimizer = optim.Adam(self.current_model.parameters(), lr=args.lr)

    def save_model(self, model_path):
        torch.save(self.current_model.state_dict(), model_path + f'/{self.id}_dqn')
        torch.save(self.target_model.state_dict(), model_path + f'/{self.id}_dqn_target')

    def load_model(self, model_path, eval=False, map_location=None):
        self.current_model.load_state_dict(
            torch.load(model_path + f'/{self.id}_dqn', map_location=map_location))
        self.target_model.load_state_dict(
            torch.load(model_path + f'/{self.id}_dqn_target', map_location=map_location))
        if eval:
            self.current_model.eval()
            self.target_model.eval()
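# Hedged sketch of the helpers assumed above (update_target, epsilon_scheduler); these are
# common definitions, not necessarily the project's actual ones.
import math

def update_target(current_model, target_model):
    # Hard-copy online weights into the target network.
    target_model.load_state_dict(current_model.state_dict())

def epsilon_scheduler(eps_start, eps_final, eps_decay):
    # Exponentially decaying exploration rate as a function of the frame index.
    def epsilon_by_frame(frame_idx):
        return eps_final + (eps_start - eps_final) * math.exp(-1.0 * frame_idx / eps_decay)
    return epsilon_by_frame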
def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)

    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)

    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99
    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)
            total_reward += reward

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break

            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index, reward, state_prime, done)
            state = state_prime

            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer, total_steps)
                if total_steps % 2000 == 0:
                    update_target(policy_net, target_net)
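# Hedged sketch of the append_sample helper called above, assuming Memory exposes a
# put(error, transition) interface for prioritized replay and that both networks map a
# batched state to Q-values of shape [1, num_actions]. Not the project's actual code.
def append_sample(memory, policy_net, target_net, state, action, reward, state_prime, done, gamma=0.99):
    with torch.no_grad():
        q = policy_net(state.unsqueeze(0))[0][action].item()
        next_q = target_net(state_prime.unsqueeze(0)).max(1)[0].item()
        target = reward if done else reward + gamma * next_q
    memory.put(abs(q - target), (state, action, reward, state_prime, done))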
class DQNTrainer():
    def __init__(self, env, args):
        super().__init__()
        self.model = DQN(env, args, Nash=False).to(args.device)
        self.target = DQN(env, args, Nash=False).to(args.device)
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
        self.args = args

    def push(self, s, a, r, s_, d):
        self.replay_buffer.push(s, a, r, s_, np.float32(d))

    def update(self):
        state, action, reward, next_state, done = self.replay_buffer.sample(self.args.batch_size)
        state = torch.FloatTensor(np.float32(state)).to(self.args.device)
        next_state = torch.FloatTensor(np.float32(next_state)).to(self.args.device)
        action = torch.LongTensor(action).to(self.args.device)
        reward = torch.FloatTensor(reward).to(self.args.device)
        done = torch.FloatTensor(done).to(self.args.device)

        # Q-Learning with target network
        q_values = self.model(state)
        target_next_q_values = self.target(next_state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = target_next_q_values.max(1)[0]
        expected_q_value = reward + (self.args.gamma**self.args.multi_step) * next_q_value * (1 - done)

        # Huber loss
        loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none')
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def act(self, s, args):
        return self.model.act(s, args)

    def save_model(self, model_path):
        torch.save(self.model.state_dict(), model_path + 'dqn')
        torch.save(self.target.state_dict(), model_path + 'dqn_target')
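# Hedged usage sketch for DQNTrainer (assumes a gym-style env, that DQN.act returns an int,
# that ReplayBuffer supports len(), and that args carries max_steps / update_target fields).
trainer = DQNTrainer(env, args)
state = env.reset()
for step in range(args.max_steps):
    action = trainer.act(state, args)
    next_state, reward, done, _ = env.step(action)
    trainer.push(state, action, reward, next_state, done)
    state = env.reset() if done else next_state
    if len(trainer.replay_buffer) > args.batch_size:
        trainer.update()
    if step % args.update_target == 0:
        # DQNTrainer never syncs self.target itself, so the loop has to do it.
        trainer.target.load_state_dict(trainer.model.state_dict())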
def train_setting(env, device):
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)
    return n_actions, policy_net, target_net, optimizer, memory
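# Hedged sketch of the get_screen helper assumed by train_setting: render the env to an
# RGB array and return a [1, C, H, W] float tensor on the target device (tutorial-style
# preprocessing; the resize size is an assumption).
import numpy as np
import torch
import torchvision.transforms as T

_resize = T.Compose([T.ToPILImage(), T.Resize(40), T.ToTensor()])

def get_screen(env, device):
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))  # HWC -> CHW
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255.0
    return _resize(torch.from_numpy(screen)).unsqueeze(0).to(device)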
class Agent(): def __init__(self, args, env): self.args = args self.action_space = env.action_space() self.atoms = args.atoms self.Vmin = args.V_min self.Vmax = args.V_max self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to( device=args.device) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.norm_clip = args.norm_clip self.coeff = 0.01 if args.game in [ 'pong', 'boxing', 'private_eye', 'freeway' ] else 1. self.online_net = DQN(args, self.action_space).to(device=args.device) self.momentum_net = DQN(args, self.action_space).to(device=args.device) # self.predictor = prediction_MLP(in_dim=128, hidden_dim=128, out_dim=128) if args.model: # Load pretrained model if provided if os.path.isfile(args.model): state_dict = torch.load( args.model, map_location='cpu' ) # Always load tensors onto CPU by default, will shift to GPU if necessary if 'conv1.weight' in state_dict.keys(): for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'), ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'), ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')): state_dict[new_key] = state_dict[ old_key] # Re-map state dict for old pretrained models del state_dict[ old_key] # Delete old keys for strict load_state_dict self.online_net.load_state_dict(state_dict) print("Loading pretrained model: " + args.model) else: # Raise error if incorrect model path provided raise FileNotFoundError(args.model) self.online_net.train() # self.pred.train() self.initialize_momentum_net() self.momentum_net.train() self.target_net = DQN(args, self.action_space).to(device=args.device) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False for param in self.momentum_net.parameters(): param.requires_grad = False self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps) # Resets noisy weights in all linear layers (of online net only) def reset_noise(self): self.online_net.reset_noise() # Acts based on single state (no batch) def act(self, state): with torch.no_grad(): a, _, _ = self.online_net(state.unsqueeze(0)) return (a * self.support).sum(2).argmax(1).item() # Acts with an ε-greedy policy (used for evaluation only) def act_e_greedy( self, state, epsilon=0.001): # High ε can reduce evaluation scores drastically return np.random.randint( 0, self.action_space ) if np.random.random() < epsilon else self.act(state) def learn(self, mem): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample( self.batch_size) # print('\n\n---------------') # print(f'idxs: {idxs}, ') # print(f'states: {states.shape}, ') # print(f'actions: {actions.shape}, ') # print(f'returns: {returns.shape}, ') # print(f'next_states: {next_states.shape}, ') # print(f'nonterminals: {nonterminals.shape}, ') # print(f'weights: {weights.shape},') aug_states_1 = aug(states).to(device=self.args.device) aug_states_2 = aug(states).to(device=self.args.device) # print(f'aug_states_1: {aug_states_1.shape}') # print(f'aug_states_2: {aug_states_2.shape}') # Calculate current state probabilities (online network noise already sampled) log_ps, _, _ = self.online_net( states, log=True) # Log probabilities log p(s_t, ·; θonline) _, z_1, p_1 = self.online_net(aug_states_1, log=True) _, z_2, p_2 = self.online_net(aug_states_2, log=True) # p_1, 
p_2 = self.pred(z_1), self.pred(z_2) # with torch.no_grad(): # p_2 = self.pred(z_2) simsiam_loss = 2 + D(p_1, z_2) / 2 + D(p_2, z_1) / 2 # simsiam_loss = p_1.mean() + p_2.mean() # simsiam_loss = p_1.mean() * 128 # simsiam_loss = - F.cosine_similarity(p_1, z_2.detach(), dim=-1).mean() # print(simsiam_loss) # simsiam_loss = 0 # _, z_target = self.momentum_net(aug_states_2, log=True) #z_k # z_proj = torch.matmul(self.online_net.W, z_target.T) # logits = torch.matmul(z_anch, z_proj) # logits = (logits - torch.max(logits, 1)[0][:, None]) # logits = logits * 0.1 # labels = torch.arange(logits.shape[0]).long().to(device=self.args.device) # moco_loss = (nn.CrossEntropyLoss()(logits, labels)).to(device=self.args.device) log_ps_a = log_ps[range(self.batch_size), actions] # log p(s_t, a_t; θonline) # print(f'z_1: {z_1.shape}') # print(f'p_1: {p_1.shape}') # print('---------------\n\n') # 1/0 with torch.no_grad(): # Calculate nth next state probabilities pns, _, _ = self.online_net( next_states) # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).argmax( 1 ) # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns, _, _ = self.target_net( next_states) # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.new_zeros(self.batch_size, self.atoms) offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).to(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_ps_a, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) # loss = loss + (moco_loss * self.coeff) loss = loss + (simsiam_loss * self.coeff) self.online_net.zero_grad() # self.pred.zero_grad() curl_loss = (weights * loss).mean() # print(curl_loss) curl_loss.mean().backward( ) # Backpropagate importance-weighted minibatch loss clip_grad_norm_(self.online_net.parameters(), self.norm_clip) # Clip gradients by L2 norm self.optimiser.step() mem.update_priorities(idxs, loss.detach().cpu().numpy() ) # Update priorities of sampled transitions def learn_old(self, mem): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample( self.batch_size) # print('\n\n---------------') # print(f'idxs: {idxs}, ') # print(f'states: {states.shape}, ') # print(f'actions: {actions.shape}, ') # print(f'returns: {returns.shape}, ') # print(f'next_states: {next_states.shape}, ') # print(f'nonterminals: 
{nonterminals.shape}, ') # print(f'weights: {weights.shape},') aug_states_1 = aug(states).to(device=self.args.device) aug_states_2 = aug(states).to(device=self.args.device) # print(f'aug_states_1: {aug_states_1.shape}') # print(f'aug_states_2: {aug_states_2.shape}') # Calculate current state probabilities (online network noise already sampled) log_ps, _, _ = self.online_net( states, log=True) # Log probabilities log p(s_t, ·; θonline) _, z_anch, _ = self.online_net(aug_states_1, log=True) #z_q _, z_target, _ = self.momentum_net(aug_states_2, log=True) #z_k z_proj = torch.matmul(self.online_net.W, z_target.T) logits = torch.matmul(z_anch, z_proj) logits = (logits - torch.max(logits, 1)[0][:, None]) logits = logits * 0.1 labels = torch.arange( logits.shape[0]).long().to(device=self.args.device) moco_loss = (nn.CrossEntropyLoss()(logits, labels)).to(device=self.args.device) log_ps_a = log_ps[range(self.batch_size), actions] # log p(s_t, a_t; θonline) # print(f'z_anch: {z_anch.shape}') # print(f'z_target: {z_target.shape}') # print(f'z_proj: {z_proj.shape}') # print(f'logits: {logits.shape}') # print(logits) # print(f'labels: {labels.shape}') # print(labels) # print('---------------\n\n') # 1/0 with torch.no_grad(): # Calculate nth next state probabilities pns, _, _ = self.online_net( next_states) # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).argmax( 1 ) # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns, _, _ = self.target_net( next_states) # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.new_zeros(self.batch_size, self.atoms) offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).to(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_ps_a, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) print(moco_loss) loss = loss + (moco_loss * self.coeff) self.online_net.zero_grad() curl_loss = (weights * loss).mean() curl_loss.mean().backward( ) # Backpropagate importance-weighted minibatch loss clip_grad_norm_(self.online_net.parameters(), self.norm_clip) # Clip gradients by L2 norm self.optimiser.step() mem.update_priorities(idxs, loss.detach().cpu().numpy() ) # Update priorities of sampled transitions def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) def initialize_momentum_net(self): for param_q, param_k in 
zip(self.online_net.parameters(), self.momentum_net.parameters()): param_k.data.copy_(param_q.data) # update param_k.requires_grad = False # not update by gradient # Code for this function from https://github.com/facebookresearch/moco @torch.no_grad() def update_momentum_net(self, momentum=0.999): for param_q, param_k in zip(self.online_net.parameters(), self.momentum_net.parameters()): param_k.data.copy_(momentum * param_k.data + (1. - momentum) * param_q.data) # update # Save model parameters on current device (don't move model between devices) def save(self, path, name='model.pth'): torch.save(self.online_net.state_dict(), os.path.join(path, name)) # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): with torch.no_grad(): a, _, _ = self.online_net(state.unsqueeze(0)) return (a * self.support).sum(2).max(1)[0].item() def train(self): self.online_net.train() def eval(self): self.online_net.eval()
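# Hedged sketch of the similarity function D used in the SimSiam-style loss above: the
# standard negative cosine similarity with a stop-gradient on the target branch
# (assumes p and z are [batch, dim] feature tensors).
import torch.nn.functional as F

def D(p, z):
    return -F.cosine_similarity(p, z.detach(), dim=-1).mean()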
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.batch_size = args.batch_size self.discount = args.discount self.max_gradient_norm = args.max_gradient_norm self.policy_net = DQN(args, self.action_space) if args.model and os.path.isfile(args.model): self.policy_net.load_state_dict(torch.load(args.model)) self.policy_net.train() self.target_net = DQN(args, self.action_space) self.update_target_net() self.target_net.eval() self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr) def act(self, state, epsilon): if random.random() > epsilon: return self.policy_net(state.unsqueeze(0)).max(1)[1].data[0] else: return random.randint(0, self.action_space - 1) def learn(self, mem): transitions = mem.sample(self.batch_size) batch = Transition(*zip(*transitions)) # Transpose the batch states = Variable(torch.stack(batch.state, 0)) actions = Variable(torch.LongTensor(batch.action).unsqueeze(1)) rewards = Variable(torch.Tensor(batch.reward)) non_final_mask = torch.ByteTensor( tuple(map( lambda s: s is not None, batch.next_state))) # Only process non-terminal next states next_states = Variable( torch.stack(tuple(s for s in batch.next_state if s is not None), 0), volatile=True ) # Prevent backpropagating through expected action values Qs = self.policy_net(states).gather(1, actions) # Q(s_t, a_t; θpolicy) next_state_argmax_indices = self.policy_net(next_states).max( 1, keepdim=True )[1] # Perform argmax action selection using policy network: argmax_a[Q(s_t+1, a; θpolicy)] Qns = Variable(torch.zeros( self.batch_size)) # Q(s_t+1, a) = 0 if s_t+1 is terminal Qns[non_final_mask] = self.target_net(next_states).gather( 1, next_state_argmax_indices ) # Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget) Qns.volatile = False # Remove volatile flag to prevent propagating it through loss target = rewards + ( self.discount * Qns ) # Double-Q target: Y = r + γ.Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget) loss = F.smooth_l1_loss( Qs, target) # Huber loss on TD-error δ: δ = Y - Q(s_t, a_t) # TODO: TD-error clipping? self.policy_net.zero_grad() loss.backward() nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm) # Clamp gradients self.optimiser.step() def update_target_net(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def save(self, path): torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth')) def evaluate_q(self, state): return self.policy_net(state.unsqueeze(0)).max(1)[0].data[0] def train(self): self.policy_net.train() def eval(self): self.policy_net.eval()
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.atoms = args.atoms self.Vmin = args.V_min self.Vmax = args.V_max self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to( device=args.device) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.online_net = DQN(args, self.action_space).to(device=args.device) if args.model and os.path.isfile(args.model): # Always load tensors onto CPU by default, will shift to GPU if necessary self.online_net.load_state_dict( torch.load(args.model, map_location='cpu')) self.online_net.train() self.target_net = DQN(args, self.action_space).to(device=args.device) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) # Resets noisy weights in all linear layers (of online net only) def reset_noise(self): self.online_net.reset_noise() # Acts based on single state (no batch) def act(self, state): with torch.no_grad(): return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).argmax(1).item() # Acts with an ε-greedy policy (used for evaluation only) def act_e_greedy( self, state, epsilon=0.001): # High ε can reduce evaluation scores drastically return random.randrange( self.action_space) if random.random() < epsilon else self.act( state) def learn(self, mem): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample( self.batch_size) # Calculate current state probabilities (online network noise already sampled) log_ps = self.online_net( states, log=True) # Log probabilities log p(s_t, ·; θonline) log_ps_a = log_ps[range(self.batch_size), actions] # log p(s_t, a_t; θonline) with torch.no_grad(): # Calculate nth next state probabilities pns = self.online_net( next_states) # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).argmax( 1 ) # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns = self.target_net( next_states) # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.new_zeros(self.batch_size, self.atoms) offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).to(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u 
+ p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_ps_a, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) self.online_net.zero_grad() (weights * loss).mean().backward( ) # Backpropagate importance-weighted minibatch loss self.optimiser.step() mem.update_priorities( idxs, loss.detach()) # Update priorities of sampled transitions def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) # Save model parameters on current device (don't move model between devices) def save(self, path): torch.save(self.online_net.state_dict(), os.path.join(path, 'model.pth')) # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): with torch.no_grad(): return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item() def train(self): self.online_net.train() def eval(self): self.online_net.eval()
device0 = torch.device("cuda:0") else: device0 = torch.device("cpu") dtype = torch.cuda.FloatTensor if torch.cuda.is_available( ) else torch.FloatTensor dlongtype = torch.cuda.LongTensor if torch.cuda.is_available( ) else torch.LongTensor duinttype = torch.cuda.ByteTensor if torch.cuda.is_available( ) else torch.ByteTensor Qt = DQN(in_channels=5, num_actions=18).type(dtype) Qt_t = DQN(in_channels=5, num_actions=18).type(dtype) Qt_t.load_state_dict(Qt.state_dict()) Qt_t.eval() for param in Qt_t.parameters(): param.requires_grad = False if torch.cuda.device_count() > 0: Qt = nn.DataParallel(Qt).to(device0) Qt_t = nn.DataParallel(Qt_t).to(device0) batch_size = BATCH_SIZE * torch.cuda.device_count() else: batch_size = BATCH_SIZE # optimizer optimizer = optim.RMSprop(Qt.parameters(), lr=LEARNING_RATE, alpha=ALPHA, eps=EPS)
class DQNAgent: """ Interacts with and learns from the environment. Vanilla DQN. """ def __init__(self, state_size: int, action_size: int, seed: int): """ Initialize an Agent object. :param state_size: dimension of each state; :param action_size: dimension of each action; :param seed: random seed. """ self.state_size = state_size self.action_size = action_size random.seed(seed) # Q-Network self.network_local = DQN(state_size, action_size, seed).to(DEVICE) self.network_target = DQN(state_size, action_size, seed).to(DEVICE) self.optimizer = optim.Adam(self.network_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action: int, reward: float, next_state, done): """ Save experiences in the replay memory and check if it's time to learn. :param state: (array_like) current state; :param action: action taken; :param reward: reward received; :param next_state: (array_like) next state; :param done: terminal state indicator; int or bool. """ # Save experience in replay memory self.memory.push(state, action, reward, next_state, done) # Increment time step and compare it to the network update frequency self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # Check if there is enough samples in the memory to learn if len(self.memory) > BATCH_SIZE: # sample experiences from memory experiences = self.memory.sample() # learn from sampled experiences self.learn(experiences, GAMMA) def act(self, state, eps: float = 0.): """ Returns actions for given state as per current policy. :param state: (array_like) current state :param eps: epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE) self.network_local.eval() with torch.no_grad(): action_values = self.network_local(state) self.network_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma: float): """ Update value parameters using given batch of experience tuples. :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples; :param gamma: discount factor. """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.network_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.network_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.network_local, self.network_target, TAU) @staticmethod def soft_update(local_model, target_model, tau: float): """ Soft update model parameters, θ_target = τ*θ_local + (1 - τ)*θ_target. :param local_model: (PyTorch model) weights will be copied from; :param target_model: (PyTorch model) weights will be copied to; :param tau: interpolation parameter. """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
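# Hedged sketch of a ReplayBuffer compatible with the DQNAgent above: push() stores raw
# transitions and sample() returns torch tensors on DEVICE with actions shaped [B, 1]
# so that gather(1, actions) works. Field names and layout are assumptions.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def push(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(DEVICE)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(DEVICE)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(DEVICE)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(DEVICE)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(DEVICE)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)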
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float).cuda()
        a = torch.tensor(a, dtype=torch.long).cuda()
        r = torch.tensor(r, dtype=torch.float).cuda()
        s2 = torch.tensor(s2, dtype=torch.float).cuda()
        done = torch.tensor(done, dtype=torch.float).cuda()

        q_values = self.model(s)
        next_q_values = self.model(s2)
        next_q_state_values = self.target_model(s2)

        # Double DQN: select actions with the online net, evaluate them with the target net
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
class Learner: def __init__(self, network, batch_size): self.learner_network = DQN(19).cuda().float() self.learner_target_network = DQN(19).cuda().float() self.learner_network.load_state_dict(network.state_dict()) self.learner_target_network.load_state_dict(network.state_dict()) self.shared_network = DQN(19).cpu() self.count = 0 self.batch_size = batch_size wandb.init(project='apex_dqfd_Learner', entity='neverparadise') # 1. sampling # 2. calculate gradient # 3. weight update # 4. compute priorities # 5. priorities of buffer update # 6. remove old memory def count(self): return self.count def get_network(self): self.shared_network.load_state_dict(self.learner_network.state_dict()) return self.shared_network def update_network(self, memory, demos, batch_size, optimizer, actor): while(ray.get(actor.get_counter.remote()) < 100): print("update_network") agent_batch, agent_idxs, agent_weights = ray.get(memory.sample.remote(batch_size)) demo_batch, demo_idxs, demo_weights = ray.get(demos.sample.remote(batch_size)) # demo_batch = (batch_size, state, action, reward, next_state, done, n_rewards) # print(len(demo_batch[0])) # 0번째 배치이므로 0이 나옴 state_list = [] action_list = [] reward_list = [] next_state_list = [] done_mask_list = [] n_rewards_list = [] for agent_transition in agent_batch: s, a, r, s_prime, done_mask, n_rewards = agent_transition state_list.append(s) action_list.append([a]) reward_list.append([r]) next_state_list.append(s_prime) done_mask_list.append([done_mask]) n_rewards_list.append([n_rewards]) for expert_transition in demo_batch: s, a, r, s_prime, done_mask, n_rewards = expert_transition state_list.append(s) action_list.append([a]) reward_list.append([r]) next_state_list.append(s_prime) done_mask_list.append([done_mask]) n_rewards_list.append([n_rewards]) s = torch.stack(state_list).float().cuda() a = torch.tensor(action_list, dtype=torch.int64).cuda() r = torch.tensor(reward_list).cuda() s_prime = torch.stack(next_state_list).float().cuda() done_mask = torch.tensor(done_mask_list).float().cuda() nr = torch.tensor(n_rewards_list).float().cuda() q_vals = self.learner_network(s) state_action_values = q_vals.gather(1, a) # comparing the q values to the values expected using the next states and reward next_state_values = self.learner_target_network(s_prime).max(1)[0].unsqueeze(1) target = r + (next_state_values * gamma * done_mask) # calculating the q loss, n-step return lossm supervised_loss is_weights = torch.FloatTensor(agent_weights).to(device) q_loss = (is_weights * F.mse_loss(state_action_values, target)).mean() n_step_loss = (state_action_values.max(1)[0] + nr).mean() supervised_loss = margin_loss(q_vals, a, 1, 1) loss = q_loss + supervised_loss + n_step_loss errors = torch.abs(state_action_values - target).data.cpu().detach() errors = errors.numpy() # update priority for i in range(batch_size): idx = agent_idxs[i] memory.update.remote(idx, errors[i]) # optimization step and logging optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm(self.learner_network.parameters(), 100) optimizer.step() torch.save(self.learner_network.state_dict(), model_path + "apex_dqfd_learner.pth") self.count +=1 if(self.count % 20 == 0 and self.count != 0): self.update_target_networks() print("leaner_network updated") return loss def update_target_networks(self): self.learner_target_network.load_state_dict(self.learner_network.state_dict()) print("leaner_target_network updated")
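# Hedged sketch of the margin_loss used in the Learner above: the DQfD large-margin
# supervised loss, which pushes the expert action's Q-value above all others by a margin.
# The signature matches the call margin_loss(q_vals, a, 1, 1); internals are assumptions.
def margin_loss(q_values, expert_actions, margin, weight):
    # q_values: [B, num_actions]; expert_actions: [B, 1] long tensor of demonstrated actions
    margins = torch.full_like(q_values, float(margin))
    margins.scatter_(1, expert_actions, 0.0)               # zero margin at the expert action
    augmented_max = (q_values + margins).max(1)[0]         # max_a [Q(s, a) + l(a_E, a)]
    expert_q = q_values.gather(1, expert_actions).squeeze(1)
    return weight * (augmented_max - expert_q).mean()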
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.atoms = args.atoms self.Vmin = args.V_min self.Vmax = args.V_max self.support = torch.linspace(args.V_min, args.V_max, args.atoms) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.online_net = DQN(args, self.action_space) if args.model and os.path.isfile(args.model): self.online_net.load_state_dict( torch.load(args.model, map_location='cpu')) self.online_net.train() self.target_net = DQN(args, self.action_space) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) if args.cuda: self.online_net.cuda() self.target_net.cuda() self.support = self.support.cuda() # Resets noisy weights in all linear layers (of online net only) def reset_noise(self): self.online_net.reset_noise() # Acts based on single state (no batch) def act(self, state): return (self.online_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0] # Acts with an ε-greedy policy def act_e_greedy(self, state, epsilon=0.001): return random.randrange( self.action_space) if random.random() < epsilon else self.act( state) def learn(self, mem): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample( self.batch_size) # Calculate current state probabilities self.online_net.reset_noise() # Sample new noise for online network ps = self.online_net(states) # Probabilities p(s_t, ·; θonline) ps_a = ps[range(self.batch_size), actions] # p(s_t, a_t; θonline) # Calculate nth next state probabilities self.online_net.reset_noise() # Sample new noise for action selection pns = self.online_net( next_states).data # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).max( 1 )[1] # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns = self.target_net( next_states).data # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().long(), b.ceil().long() # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.data.new(self.batch_size, self.atoms).zero_() offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).type_as(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) ps_a = ps_a.clamp(min=1e-3) # Clamp for numerical stability in log loss = 
-torch.sum( Variable(m) * ps_a.log(), 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) self.online_net.zero_grad() (weights * loss).mean().backward() # Importance weight losses self.optimiser.step() mem.update_priorities( idxs, loss.data) # Update priorities of sampled transitions def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) def save(self, path): torch.save(self.online_net.state_dict(), os.path.join(path, 'model.pth')) # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): return (self.online_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0] def train(self): self.online_net.train() def eval(self): self.online_net.eval()
def train(env, args, writer): p1_current_model = DQN(env, args).to(args.device) p1_target_model = DQN(env, args).to(args.device) update_target(p1_current_model, p1_target_model) p2_current_model = DQN(env, args).to(args.device) p2_target_model = DQN(env, args).to(args.device) update_target(p2_current_model, p2_target_model) if args.noisy: p1_current_model.update_noisy_modules() p1_target_model.update_noisy_modules() p2_current_model.update_noisy_modules() p2_target_model.update_noisy_modules() if args.load_model and os.path.isfile(args.load_model): load_model(p1_current_model, args, 1) load_model(p2_current_model, args, 2) epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay) beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames) if args.prioritized_replay: p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha) p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha) else: p1_replay_buffer = ReplayBuffer(args.buffer_size) p2_replay_buffer = ReplayBuffer(args.buffer_size) p1_state_deque = deque(maxlen=args.multi_step) p2_state_deque = deque(maxlen=args.multi_step) p1_reward_deque = deque(maxlen=args.multi_step) p1_action_deque = deque(maxlen=args.multi_step) p2_reward_deque = deque(maxlen=args.multi_step) p2_action_deque = deque(maxlen=args.multi_step) p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr) p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr) length_list = [] p1_reward_list, p1_loss_list = [], [] p2_reward_list, p2_loss_list = [], [] p1_episode_reward, p2_episode_reward = 0, 0 episode_length = 0 prev_time = time.time() prev_frame = 1 (p1_state, p2_state) = env.reset() for frame_idx in range(1, args.max_frames + 1): if args.noisy: p1_current_model.sample_noise() p1_target_model.sample_noise() p2_current_model.sample_noise() p2_target_model.sample_noise() epsilon = epsilon_by_frame(frame_idx) p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon) p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon) if args.render: env.render() actions = {"1": p1_action, "2": p2_action} (p1_next_state, p2_next_state), reward, done, _ = env.step(actions) p1_state_deque.append(p1_state) p2_state_deque.append(p2_state) if args.negative: p1_reward_deque.append(reward[0] - 1) else: p1_reward_deque.append(reward[0]) p1_action_deque.append(p1_action) if args.negative: p2_reward_deque.append(reward[1] - 1) else: p2_reward_deque.append(reward[1]) p2_action_deque.append(p2_action) if len(p1_state_deque) == args.multi_step or done: n_reward = multi_step_reward(p1_reward_deque, args.gamma) n_state = p1_state_deque[0] n_action = p1_action_deque[0] p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done)) n_reward = multi_step_reward(p2_reward_deque, args.gamma) n_state = p2_state_deque[0] n_action = p2_action_deque[0] p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done)) (p1_state, p2_state) = (p1_next_state, p2_next_state) p1_episode_reward += (reward[0]) p2_episode_reward += (reward[1]) if args.negative: p1_episode_reward -= 1 p2_episode_reward -= 1 episode_length += 1 if done or episode_length > args.max_episode_length: (p1_state, p2_state) = env.reset() p1_reward_list.append(p1_episode_reward) p2_reward_list.append(p2_episode_reward) length_list.append(episode_length) writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx) 
writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx) writer.add_scalar("data/episode_length", episode_length, frame_idx) p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0 p1_state_deque.clear() p2_state_deque.clear() p1_reward_deque.clear() p2_reward_deque.clear() p1_action_deque.clear() p2_action_deque.clear() if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0: beta = beta_by_frame(frame_idx) loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta) p1_loss_list.append(loss.item()) writer.add_scalar("data/p1_loss", loss.item(), frame_idx) loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta) p2_loss_list.append(loss.item()) writer.add_scalar("data/p2_loss", loss.item(), frame_idx) if frame_idx % args.update_target == 0: update_target(p1_current_model, p1_target_model) update_target(p2_current_model, p2_target_model) if frame_idx % args.evaluation_interval == 0: print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list) print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list) p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear() p1_loss_list.clear(), p2_loss_list.clear() prev_frame = frame_idx prev_time = time.time() save_model(p1_current_model, args, 1) save_model(p2_current_model, args, 2) save_model(p1_current_model, args, 1) save_model(p2_current_model, args, 2)
adam_learning_rate = 0.000625
epsilon = 1
epsilon_max = 1
epsilon_min = .01
decay_rate = 0.005  # chosen to reach the minimum in around 400-500 episodes
tau = .001  # for soft updates, taken from "Continuous control with deep reinforcement learning"
buffer_size = 1000  # replay buffer size
memory_size = 72    # minimum memories needed for training
replaybuffer = deque(maxlen=buffer_size)

model = DQN(total_states, total_actions, seed)
target = DQN(total_states, total_actions, seed)
optimizer = optim.Adam(model.parameters(), lr=adam_learning_rate)

max_steps = 1000
max_episodes = 2000
episode = 0
frames = 0
target_update_rate = 4
scores = list()
cmr = list()
score_bucket = deque(maxlen=100)
score_bucket.append(0)
runtimes = list()

while episode < max_episodes and np.mean(score_bucket) < 200:
    state = env.reset()  # Restart the game
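# Hedged side note (assumption, the episode loop above is truncated): with decay_rate = 0.005
# applied per episode, an exponential schedule of the form below drops epsilon close to
# epsilon_min after a few hundred episodes, matching the comment on decay_rate.
def epsilon_by_episode(episode, eps_min=0.01, eps_max=1.0, decay_rate=0.005):
    return eps_min + (eps_max - eps_min) * np.exp(-decay_rate * episode)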
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print('----- cuda available ----')
else:
    print('----- cuda unavailable ----')

policy_net = DQN(output=4).to(device)
target_net = DQN(output=4).to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = torch.optim.Adam(policy_net.parameters(), lr=lr)

env = gym.make('PongNoFrameskip-v4')
# env = gym.make('Pong-v0')
env = envwrapper.make_env(env)

# prepare memory
OPTIMIZE_THRESHOLD = 1000
capacity = OPTIMIZE_THRESHOLD * 10
replaymemory = memory.ReplayMemory(capacity)

episode_rewards = train(env, EPISODE_NUM)
plot_rewards(episode_rewards)
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.atoms = args.atoms self.Vmin = args.V_min self.Vmax = args.V_max self.support = torch.linspace(args.V_min, args.V_max, args.atoms) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.priority_exponent = args.priority_exponent self.max_gradient_norm = args.max_gradient_norm self.policy_net = DQN(args, self.action_space) if args.model and os.path.isfile(args.model): self.policy_net.load_state_dict(torch.load(args.model)) self.policy_net.train() self.target_net = DQN(args, self.action_space) self.update_target_net() self.target_net.eval() self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps) if args.cuda: self.policy_net.cuda() self.target_net.cuda() self.support = self.support.cuda() # Resets noisy weights in all linear layers (of policy and target nets) def reset_noise(self): self.policy_net.reset_noise() self.target_net.reset_noise() # Acts based on single state (no batch) def act(self, state): return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0] def learn(self, mem): idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size) batch_size = len(idxs) # May return less than specified if invalid transitions sampled # Calculate current state probabilities ps = self.policy_net(states) # Probabilities p(s_t, ·; θpolicy) ps_a = ps[range(batch_size), actions] # p(s_t, a_t; θpolicy) # Calculate nth next state probabilities pns = self.policy_net(next_states).data # Probabilities p(s_t+n, ·; θpolicy) dns = self.support.expand_as(pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θpolicy)) argmax_indices_ns = dns.sum(2).max(1)[1] # Perform argmax action selection using policy network: argmax_a[(z, p(s_t+n, a; θpolicy))] pns = self.target_net(next_states).data # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range(batch_size), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θpolicy))]; θtarget) pns_a *= nonterminals # Set p = 0 for terminal nth next states as all possible expected returns = expected reward at final transition # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().long(), b.ceil().long() # Distribute probability of Tz m = states.data.new(batch_size, self.atoms).zero_() offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).long().unsqueeze(1).expand(batch_size, self.atoms).type_as(actions) m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum(Variable(m) * ps_a.log(), 1) # Cross-entropy loss (minimises Kullback-Leibler divergence) self.policy_net.zero_grad() (weights * loss).mean().backward() # Importance weight losses nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm) # Clip gradients (normalising by max value of gradient L2 norm) self.optimiser.step() 
mem.update_priorities(idxs, loss.data.abs().pow(self.priority_exponent)) # Update priorities of sampled transitions def update_target_net(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def save(self, path): torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth')) # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0] def train(self): self.policy_net.train() def eval(self): self.policy_net.eval()
DEVICE = "cpu" print(f"Using device: {DEVICE}") print(f"Settings:\n{SETTINGS}") n_actions = len(SETTINGS["actions"]) n_episodes = SETTINGS["num_episodes"] max_episode_len = SETTINGS["max_episode_length"] dims = SETTINGS["world_dims"] eps = SETTINGS["eps"] policy_net = DQN(dims[0], dims[1], dims[2], n_actions).to(DEVICE) target_net = DQN(dims[0], dims[1], dims[2], n_actions).to(DEVICE) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() optimizer = optim.RMSprop(policy_net.parameters()) memory = ExperienceReplay(100) total_steps = 0 env = gameEnv(partial=False, size=SETTINGS["world_size"]) # # Methods # ------------------------------------------------------------------------------ def select_action(state, eps, n_actions): f""" Chooses an action either randomly or from the policy. """
class DQNAgent: """ 初始化 @:param env_id : gym环境id """ def __init__(self, env_id, config): # gym self._env_id = env_id self._env = gym.make(env_id) self._state_size = self._env.observation_space.shape[0] self._action_size = self._env.action_space.n # 参数 self._gamma = config.gamma self._learning_rate = config.lr self._reward_boundary = config.reward_boundary self._device = torch.device("cuda" if config.cuda and torch.cuda.is_available() else "cpu") # model self._model = DQN(self._state_size, self._action_size).to(self._device) self._optimizer = torch.optim.Adam(self._model.parameters(), lr=self._learning_rate) # 经验池 self._replay_buffer = deque(maxlen=config.buffer_size) self._mini_batch = config.mini_batch # epsilon self._epsilon = config.epsilon self._epsilon_min = config.epsilon_min self._epsilon_decay = config.epsilon_decay """ 将observation放入双向队列中,队列满时自动删除最旧的元素 """ def remember(self, state, action, next_state, reward, done): self._replay_buffer.append((state, action, next_state, reward, done)) # epsilon幂指数下降 if len(self._replay_buffer) > self._mini_batch: if self._epsilon > self._epsilon_min: self._epsilon *= self._epsilon_decay pass """ epsilon-greedy action """ def act(self, state): # 类似模拟退火,random返回[0,1] if np.random.random() <= self._epsilon: return random.randrange(self._action_size) else: # numpy转成tensor,unsqueeze在下标0处新增一个维度 state = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(self._device) # 模型预测 predict = self._model(state) # max在第1维处取最大,[1]为下标,[0]为值, [512*2]-> [521] return predict.max(1)[1].item() pass """ 训练 1、从双向队列中采样mini_batch 2、预测next_state 3、更新优化器 """ def replay(self): if len(self._replay_buffer) < self._mini_batch: return # 1、从双向队列中采样mini_batch mini_batch = random.sample(self._replay_buffer, self._mini_batch) # 载入方式一 # state = np.zeros((self._mini_batch, self._state_size)) # next_state = np.zeros((self._mini_batch, self._state_size)) # action, reward, done = [], [], [] # # for i in range(self._mini_batch): # state[i] = mini_batch[i][0] # action.append(mini_batch[i][1]) # next_state[i] = mini_batch[i][2] # reward.append(mini_batch[i][3]) # done.append(mini_batch[i][4]) # 载入方式二 state, action, next_state, reward, done = zip(*mini_batch) state = torch.tensor(state, dtype=torch.float).to(self._device) action = torch.tensor(action, dtype=torch.long).to(self._device) next_state = torch.tensor(next_state, dtype=torch.float).to(self._device) reward = torch.tensor(reward, dtype=torch.float).to(self._device) done = torch.tensor(done, dtype=torch.float).to(self._device) # 2、预测next_state q_target = reward + \ self._gamma * self._model(next_state).to(self._device).max(1)[0] * (1 - done) q_values = self._model(state).to(self._device).gather(1, action.unsqueeze(1)).squeeze(1) loss_func = nn.MSELoss() loss = loss_func(q_values, q_target) # loss = (q_values - q_target.detach()).pow(2).mean() # 3、更新优化器 self._optimizer.zero_grad() loss.backward() self._optimizer.step() return loss.item() """ 1、渲染gym环境开始交互 2、训练模型 """ def training(self): writer = SummaryWriter(comment="-train-" + self._env_id) print(self._model) # 参数 frame_index = 0 episode_index = 1 best_mean_reward = None mean_reward = 0 total_rewards = [] while mean_reward < self._reward_boundary: state = self._env.reset() # 一轮结束,reward置零 episode_reward = 0 while True: # 1、渲染gym环境开始交互 self._env.render() # 选择action进行交互 action = self.act(state) next_state, reward, done, _ = self._env.step(action) self.remember(state, action, next_state, reward, done) state = next_state frame_index += 1 episode_reward += reward # 2、训练模型 loss = 
self.replay() # 游戏结束,开始训练模型 if done: if loss is not None: print("episode: %4d, frames: %5d, reward: %5f, loss: %4f, epsilon: %4f" % ( episode_index, frame_index, np.mean(total_rewards[-10:]), loss, self._epsilon)) episode_index += 1 total_rewards.append(episode_reward) mean_reward = np.mean(total_rewards[-10:]) writer.add_scalar("epsilon", self._epsilon, frame_index) writer.add_scalar("episode_reward", episode_reward, frame_index) writer.add_scalar("mean_reward", mean_reward, frame_index) if best_mean_reward is None or best_mean_reward < mean_reward: torch.save(self._model.state_dict(), "training-best.dat") break self._env.close() pass def test(self, model_path): if model_path is None: return self._model.load_state_dict(torch.load(model_path)) self._model.eval() total_rewards = [] for episode_index in range(10): episode_reward = 0 done = False state = self._env.reset() while not done: action = self.act(state) next_state, reward, done, _ = self._env.step(action) state = next_state episode_reward += reward total_rewards.append(episode_reward) print("episode: %4d, reward: %5f" % (episode_index, np.mean(total_rewards[-10:])))
class DQNAgent: def __init__(self, state_size, action_size, config=RLConfig()): self.seed = random.seed(config.seed) self.state_size = state_size self.action_size = action_size self.batch_size = config.batch_size self.batch_indices = torch.arange(config.batch_size).long().to(device) self.samples_before_learning = config.samples_before_learning self.learn_interval = config.learning_interval self.parameter_update_interval = config.parameter_update_interval self.per_epsilon = config.per_epsilon self.tau = config.tau self.gamma = config.gamma if config.useDuelingDQN: self.qnetwork_local = DuelingDQN(state_size, action_size, config.seed).to(device) self.qnetwork_target = DuelingDQN(state_size, action_size, config.seed).to(device) else: self.qnetwork_local = DQN(state_size, action_size, config.seed).to(device) self.qnetwork_target = DQN(state_size, action_size, config.seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=config.learning_rate) self.doubleDQN = config.useDoubleDQN self.usePER = config.usePER if self.usePER: self.memory = PrioritizedReplayBuffer(config.buffer_size, config.per_alpha) else: self.memory = ReplayBuffer(config.buffer_size) self.t_step = 0 def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() if random.random() < eps: return random.choice(np.arange(self.action_size)) else: return np.argmax(action_values.cpu().data.numpy()) def step(self, state, action, reward, next_state, done, beta): self.memory.add(state, action, reward, next_state, done) self.t_step += 1 if self.t_step % self.learn_interval == 0: if len(self.memory) > self.samples_before_learning: state = torch.from_numpy(state).float().unsqueeze(0).to(device) next_state = torch.from_numpy(next_state).float().unsqueeze( 0).to(device) target = self.qnetwork_local(state).data old_val = target[0][action] target_val = self.qnetwork_target(next_state).data if done: target[0][action] = reward else: target[0][ action] = reward + self.gamma * torch.max(target_val) if self.usePER: states, actions, rewards, next_states, dones, weights, indices = self.memory.sample( self.batch_size, beta) else: indices = None weights = None states, actions, rewards, next_states, dones = self.memory.sample( self.batch_size) self.learn(states, actions, rewards, next_states, dones, indices, weights, self.gamma) def learn(self, states, actions, rewards, next_states, dones, indices, weights, gamma): states = torch.from_numpy(np.vstack(states)).float().to(device) actions = torch.from_numpy(np.vstack(actions)).long().to(device) rewards = torch.from_numpy(np.vstack(rewards)).float().to(device) next_states = torch.from_numpy( np.vstack(next_states)).float().to(device) dones = torch.from_numpy(np.vstack(dones.astype( np.uint8))).float().to(device) Q_targets_next = self.qnetwork_target(next_states).detach() if self.doubleDQN: # choose the best action from the local network next_actions = self.qnetwork_local(next_states).argmax(dim=-1) Q_targets_next = Q_targets_next[self.batch_indices, next_actions] else: Q_targets_next = Q_targets_next.max(1)[0] Q_targets = rewards + gamma * Q_targets_next.reshape( (self.batch_size, 1)) * (1 - dones) pred = self.qnetwork_local(states) Q_expected = pred.gather(1, actions) if self.usePER: errors = torch.abs(Q_expected - Q_targets).data.numpy() + self.per_epsilon self.memory.update_priorities(indices, errors) self.optimizer.zero_grad() loss 
= F.mse_loss(Q_expected, Q_targets) loss.backward() self.optimizer.step() if self.t_step % self.parameter_update_interval == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, qnetwork_local, qnetwork_target, tau): for local_param, target_param in zip(qnetwork_local.parameters(), qnetwork_target.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
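# The DQNAgent above only relies on a small replay-buffer interface: add(),
# sample(), __len__() (plus update_priorities() for the PER variant). Below is a
# minimal uniform-sampling sketch of that interface; it is an assumption about the
# missing ReplayBuffer module, not the original implementation.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform sampling; returns arrays compatible with np.vstack in learn().
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones))

    def __len__(self):
        return len(self.buffer)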
import gym import gym_ple from argparse import Namespace from torch import optim from solver import DQNSolver from model import mse_loss, reward_positive_func, DQN config = {'env': 'FlappyBird-v0', 'lr': 1e-5, 'gamma': 0.99, 'seed': 1, 'buffer_size': 50000, 'init_eps': 0.1, 'final_eps': 0.02, 'eps_step': 1000000, 'observate_time': 1000, 'batch_size': 32, 'save_dir': 'results/dqn', 'update_target': 500, 'use_cuda': True, 'display_interval': 10, 'save_interval': 1000, 'episode': 150000, 'visdom': True, 'use_double': False, 'reward_len': 10, 'in_c': 4, 'num_actions': 2} config = Namespace(**config) env = gym.make(config.env) criterion = mse_loss reward_func = reward_positive_func model = DQN(in_c=config.in_c, num_actions=config.num_actions) optimizer = optim.Adam(model.parameters(), lr=config.lr) solver = DQNSolver(env, model, optimizer, criterion, reward_func, config) solver.train_q_learning()
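# The config above anneals exploration from init_eps to final_eps over eps_step
# frames. A minimal sketch of that schedule, assuming a simple linear decay
# (DQNSolver's actual implementation is not shown here):
def linear_epsilon(step, init_eps=0.1, final_eps=0.02, eps_step=1_000_000):
    """Linearly anneal epsilon from init_eps down to final_eps over eps_step steps."""
    fraction = min(step / eps_step, 1.0)
    return init_eps + fraction * (final_eps - init_eps)

# linear_epsilon(0) -> 0.1, linear_epsilon(500_000) -> 0.06, linear_epsilon(1_000_000) -> 0.02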
return loss def update_target_networks(self): self.learner_target_network.load_state_dict(self.learner_network.state_dict()) print("learner_target_network updated") ray.init() policy_net = DQN(19).cuda() target_net = DQN(19).cuda() target_net.load_state_dict(policy_net.state_dict()) memory = Memory.remote(50000) demos = Memory.remote(25000) optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate, weight_decay=1e-5) # Copy network params from pretrained Agent model_path = './dqn_model/pre_trained6.pth' policy_net.load_state_dict(torch.load(model_path, map_location='cuda:0')) target_net.load_state_dict(policy_net.state_dict()) #parse_demo2.remote("MineRLTreechop-v0", demos, policy_net.cpu(), target_net.cpu(), optimizer, threshold=60, num_epochs=1, batch_size=4, seq_len=60, gamma=0.99, model_name='pre_trained4.pth') # learner network initialization batch_size = 256 demo_prob = 0.5 learner = Learner.remote(policy_net, batch_size) # actor networks, environments initialization # each actor generates its own environment instance
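# Memory.remote(...) and Learner.remote(...) above are Ray actor handles; the
# actor classes themselves are not shown. The following is a hypothetical sketch
# of a Ray replay-memory actor with the capacity/size interface the snippet assumes.
import random
from collections import deque

import ray


@ray.remote
class ReplayMemoryActor:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def size(self):
        return len(self.buffer)

# Actor methods are invoked asynchronously, e.g. ray.get(memory_actor.size.remote()).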
def main(): env = gym.make(config.ENV_NAME) agent = DQN(env) optimizer = optim.Adam(agent.parameters(), lr=0.001) finished = False for epoch in range(config.EPOCHS): state = env.reset() for step in range(config.ITERATIONS): action = agent.get_action(state, 'egreedy') next_state, reward, done, _ = env.step(action[0, 0]) if done: reward = -1 agent.replay_memory.push(Transition( config.FloatTensor([state]), action, config.FloatTensor([reward]), config.FloatTensor([next_state]) if not done else None)) state = next_state if len(agent.replay_memory) >= config.BATCH_SIZE: batch = agent.replay_memory.sample(config.BATCH_SIZE) batch = Transition(*zip(*batch)) non_final_mask = config.ByteTensor( [s is not None for s in batch.next_state]) non_final_next_state_batch = Variable(torch.cat([ s for s in batch.next_state if s is not None])) state_batch = Variable(torch.cat(batch.state), requires_grad=False) action_batch = Variable(torch.cat(batch.action).view(-1, 1), requires_grad=False) reward_batch = Variable(torch.cat(batch.reward), requires_grad=False) q_values = agent(state_batch).gather(1, action_batch) s_values = Variable(torch.zeros(config.BATCH_SIZE).type( config.FloatTensor), requires_grad=False) s_values[non_final_mask] = agent( non_final_next_state_batch).max(1)[0] expected_q_values = config.GAMMA * s_values + reward_batch loss = F.smooth_l1_loss(torch.sum(q_values), torch.sum(expected_q_values)) optimizer.zero_grad() loss.backward() optimizer.step() if done: break agent.epsilon = config.EPSILON_START - epoch / config.EPOCHS * ( config.EPSILON_START - config.EPSILON_END) if epoch % config.TEST_INTERVAL == 0: sum_reward = 0 for _epoch in range(config.TEST_EPOCHS): epoch_reward = 0 state = env.reset() for step in range(config.TEST_ITERATIONS): # env.render() action = agent.get_action(state) # Default state, reward, done, _ = env.step(action[0, 0]) if done: break epoch_reward += reward sum_reward += epoch_reward avg_reward = sum_reward / config.TEST_EPOCHS print('Epoch: {}, Average Reward: {}'.format(epoch, avg_reward)) print('Current Epsilon:', agent.epsilon) if avg_reward > 195: finished = True if finished: break while True: state = env.reset() round_reward = 0 for step in range(config.TEST_ITERATIONS): env.render() action = agent.get_action(state) # Default state, reward, done, _ = env.step(action[0, 0]) if done: break round_reward += reward print('Round reward:', round_reward)
class Agent(object): def __init__(self, args, action_space): self.action_space = action_space self.batch_size = args.batch_size self.discount = args.discount self.online_net = DQN(args, self.action_space).to(device=args.device) self.online_net.train() self.target_net = DQN(args, self.action_space).to(device=args.device) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) self.loss_func = nn.MSELoss() # Acts based on single state (no batch) def act(self, state): with torch.no_grad(): return self.online_net([state]).argmax(1).item() # Acts with an ε-greedy policy (used for evaluation only) def act_e_greedy( self, state, epsilon=0.05): # High ε can reduce evaluation scores drastically return random.randrange( self.action_space) if random.random() < epsilon else self.act( state) def learn(self, mem): # Sample transitions states, actions, next_states, rewards = mem.sample(self.batch_size) q_eval = self.online_net(states).gather( 1, actions.unsqueeze(1)).squeeze() with torch.no_grad(): q_eval_next_a = self.online_net(next_states).argmax(1) q_next = self.target_net(next_states) q_target = rewards + self.discount * q_next.gather( 1, q_eval_next_a.unsqueeze(1)).squeeze() loss = self.loss_func(q_eval, q_target) self.online_net.zero_grad() loss.backward() self.optimiser.step() def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) # Save model parameters on current device (don't move model between devices) def save(self, path): torch.save(self.online_net.state_dict(), path + '.pth') # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): with torch.no_grad(): return (self.online_net([state])).max(1)[0].item() def train(self): self.online_net.train() def eval(self): self.online_net.eval()
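# learn() above uses a double-DQN target: the online network picks argmax_a Q(s', a)
# and the target network evaluates that action. A self-contained sketch contrasting
# it with the vanilla DQN target; the tensors are illustrative only.
import torch

rewards = torch.tensor([1.0, 0.0])
discount = 0.99
q_online_next = torch.tensor([[0.2, 0.9], [0.5, 0.1]])  # Q(s', .; online net)
q_target_next = torch.tensor([[0.3, 0.7], [0.6, 0.2]])  # Q(s', .; target net)

# Vanilla DQN: max over the target network itself
vanilla_target = rewards + discount * q_target_next.max(1)[0]

# Double DQN: select with the online network, evaluate with the target network
a_star = q_online_next.argmax(1)
double_target = rewards + discount * q_target_next.gather(1, a_star.unsqueeze(1)).squeeze(1)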
class Agent(): def __init__(self, action_size): self.action_size = action_size # These are hyper parameters for the DQN self.discount_factor = 0.99 self.epsilon = 1.0 self.epsilon_min = 0.01 self.explore_step = 500000 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step self.train_start = 100000 self.update_target = 1000 # Generate the memory self.memory = ReplayMemory() # Create the policy net and the target net self.policy_net = DQN(action_size) self.policy_net.to(device) self.target_net = DQN(action_size) self.target_net.to(device) self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate) self.scheduler = optim.lr_scheduler.StepLR( self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma) # Initialize a target network and initialize the target network to the policy net ### CODE ### self.update_target_net() def load_policy_net(self, path): self.policy_net = torch.load(path) # after some time interval update the target net to be same with policy net def update_target_net(self): ### CODE ### self.target_net.load_state_dict(self.policy_net.state_dict()) """Get action using policy net using epsilon-greedy policy""" def get_action(self, state): if np.random.rand() <= self.epsilon: ### CODE #### (copy over from agent.py!) return torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long) else: ### CODE #### (copy over from agent.py!) with torch.no_grad(): state = torch.FloatTensor(state).unsqueeze(0).cuda() return self.policy_net(state).max(1)[1].view(1, 1) # pick samples randomly from replay memory (with batch_size) def train_policy_net(self, frame): if self.epsilon > self.epsilon_min: self.epsilon -= self.epsilon_decay mini_batch = self.memory.sample_mini_batch(frame) mini_batch = np.array(mini_batch).transpose() history = np.stack(mini_batch[0], axis=0) states = np.float32(history[:, :4, :, :]) / 255. states = torch.from_numpy(states).cuda() actions = list(mini_batch[1]) actions = torch.LongTensor(actions).cuda() rewards = list(mini_batch[2]) rewards = torch.FloatTensor(rewards).cuda() next_states = np.float32(history[:, 1:, :, :]) / 255. next_states = torch.tensor(next_states).cuda() dones = mini_batch[3] # checks if the game is over musk = torch.tensor(list(map(int, dones == False)), dtype=torch.bool) # Your agent.py code here with double DQN modifications ### CODE ### # Compute Q(s_t, a), the Q-value of the current state ### CODE #### state_action_values = self.policy_net(states).gather( 1, actions.view(batch_size, -1)) # Compute Q function of next state ### CODE #### next_state_values = torch.zeros(batch_size, device=device).cuda() non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, next_states)), device=device, dtype=torch.uint8) non_final_next_states = torch.cat([ i for i in next_states if i is not None ]).view(states.size()).cuda() # Compute the expected Q values next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() expected_state_action_values = (next_state_values * self.discount_factor) + rewards # Compute the Huber Loss ### CODE #### loss = F.smooth_l1_loss(state_action_values.view(32), expected_state_action_values) # Optimize the model, .step() both the optimizer and the scheduler! ### CODE #### self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step()
class Agent: def __init__(self, game: str, replay_buffer_capacity: int, replay_start_size: int, batch_size: int, discount_factor: float, lr: float, device: str = 'cuda:0', env_seed: int = 0, frame_buffer_size: int = 4, print_self=True): self.device = device self.discount_factor = discount_factor self.game = game self.batch_size = batch_size self.replay_buf = ReplayBuffer(capacity=replay_buffer_capacity) self.env = FrameStack( AtariPreprocessing( gym.make(self.game), # noop_max=0, # terminal_on_life_loss=True, scale_obs=False), num_stack=frame_buffer_size) self.env.seed(env_seed) self.reset() self.n_action = self.env.action_space.n self.policy_net = DQN(self.n_action).to(self.device) self.target_net = DQN(self.n_action).to(self.device).eval() self.optimizer = RMSprop( self.policy_net.parameters(), alpha=0.95, # momentum=0.95, eps=0.01) if print_self: print(self) self._fill_replay_buf(replay_start_size) def __repr__(self): return '\n'.join([ 'Agent:', f'Game: {self.game}', f'Device: {self.device}', f'Policy net: {self.policy_net}', f'Target net: {self.target_net}', f'Replay buf: {self.replay_buf}' ]) def _fill_replay_buf(self, replay_start_size): for _ in trange(replay_start_size, desc='Fill replay_buf randomly', leave=True): self.step(1.0) def reset(self): """Reset the end, pre-populate self.frame_buf and self.state""" self.state = self.env.reset() @torch.no_grad() def step(self, epsilon, clip_reward=True): """ Choose an action based on current state and epsilon-greedy policy """ # Choose action if random.random() <= epsilon: q_values = None action = self.env.action_space.sample() else: torch_state = torch.tensor(self.state, dtype=torch.float32, device=self.device).unsqueeze(0) / 255.0 q_values = self.policy_net(torch_state) action = int(q_values.argmax(dim=1).item()) # Apply action next_state, reward, done, _ = self.env.step(action) if clip_reward: reward = max(-1.0, min(reward, 1.0)) # Store into replay buffer self.replay_buf.append( (torch.tensor( np.array(self.state), dtype=torch.float32, device="cpu") / 255., action, reward, torch.tensor( np.array(next_state), dtype=torch.float32, device="cpu") / 255., done)) # Advance to next state self.state = next_state if done: self.reset() return reward, q_values, done def q_update(self): self.optimizer.zero_grad() states, actions, rewards, next_states, dones = [ x.to(self.device) for x in self.replay_buf.sample(self.batch_size) ] with torch.no_grad(): y = torch.where( dones, rewards, rewards + self.discount_factor * self.target_net(next_states).max(1)[0]) predicted_values = self.policy_net(states).gather( 1, actions.unsqueeze(-1)).squeeze(-1) loss = huber(y, predicted_values, 2.) loss.backward() self.optimizer.step() return (y - predicted_values).abs().mean()
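# q_update() above calls a huber(...) helper that is not defined in this snippet.
# A plausible sketch, assuming the third argument is the delta threshold at which
# the loss switches from quadratic to linear:
import torch


def huber(target, prediction, delta=1.0):
    """Huber loss: quadratic for |error| <= delta, linear beyond it."""
    error = target - prediction
    quadratic = 0.5 * error ** 2
    linear = delta * (error.abs() - 0.5 * delta)
    return torch.where(error.abs() <= delta, quadratic, linear).mean()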
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.atoms = args.atoms self.Vmin = args.V_min self.Vmax = args.V_max self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to( device=args.device) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.norm_clip = args.norm_clip self.online_net = DQN(args, self.action_space).to(device=args.device) if args.model: # Load pretrained model if provided if os.path.isfile(args.model): state_dict = torch.load( args.model, map_location='cpu' ) # Always load tensors onto CPU by default, will shift to GPU if necessary if 'conv1.weight' in state_dict.keys(): for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'), ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'), ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')): state_dict[new_key] = state_dict[ old_key] # Re-map state dict for old pretrained models del state_dict[ old_key] # Delete old keys for strict load_state_dict self.online_net.load_state_dict(state_dict) print("Loading pretrained model: " + args.model) else: # Raise error if incorrect model path provided raise FileNotFoundError(args.model) self.online_net.train() self.target_net = DQN(args, self.action_space).to(device=args.device) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False # self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps) self.convs_optimiser = optim.Adam(self.online_net.convs.parameters(), lr=args.learning_rate, eps=args.adam_eps) self.linear_optimiser = optim.Adam(chain( self.online_net.fc_h_v.parameters(), self.online_net.fc_h_a.parameters(), self.online_net.fc_z_v.parameters(), self.online_net.fc_z_a.parameters()), lr=args.learning_rate, eps=args.adam_eps) # Resets noisy weights in all linear layers (of online net only) def reset_noise(self): self.online_net.reset_noise() # Acts based on single state (no batch) def act(self, state): with torch.no_grad(): # don't count these calls since it is accounted for after "action = dqn.act(state)" in main.py ret = (self.online_net(state.unsqueeze(0)) * self.support).sum(2).argmax(1).item() return ret # Acts with an ε-greedy policy (used for evaluation only) def act_e_greedy( self, state, epsilon=0.001): # High ε can reduce evaluation scores drastically return np.random.randint( 0, self.action_space ) if np.random.random() < epsilon else self.act(state) def learn(self, mem, freeze=False): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights, _ = mem.sample( self.batch_size) # Calculate current state probabilities (online network noise already sampled) log_ps = self.online_net( states, log=True) # Log probabilities log p(s_t, ·; θonline) log_ps_a = log_ps[range(self.batch_size), actions] # log p(s_t, a_t; θonline) with torch.no_grad(): # Calculate nth next state probabilities pns = self.online_net( next_states) # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).argmax( 1 ) # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns = self.target_net( next_states) # Probabilities p(s_t+n, ·; θtarget) pns_a 
= pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.new_zeros(self.batch_size, self.atoms) offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).to(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_ps_a, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) self.online_net.zero_grad() loss.mean().backward( ) # Backpropagate importance-weighted minibatch loss clip_grad_norm_(self.online_net.parameters(), self.norm_clip) # Clip gradients by L2 norm # self.optimiser.step() if not freeze: self.convs_optimiser.step() self.linear_optimiser.step() def learn_with_latent(self, latent_mem): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights, ns = latent_mem.sample( self.batch_size) # Calculate current state probabilities (online network noise already sampled) log_ps = self.online_net.forward_with_latent( states, log=True) # Log probabilities log p(s_t, ·; θonline) log_ps_a = log_ps[range(self.batch_size), actions] # log p(s_t, a_t; θonline) with torch.no_grad(): # Calculate nth next state probabilities pns = self.online_net.forward_with_latent( next_states) # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution ds_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).argmax( 1 ) # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns = self.target_net.forward_with_latent( next_states) # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # use ns instead of self.n since n is possibly different for each sequence in the batch ns = torch.tensor(ns, device=latent_mem.device).unsqueeze(1) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**ns) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.new_zeros(self.batch_size, self.atoms) offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), 
self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).to(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_ps_a, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) self.online_net.zero_grad() loss.mean().backward( ) # Backpropagate importance-weighted minibatch loss clip_grad_norm_(self.online_net.parameters(), self.norm_clip) # Clip gradients by L2 norm # self.optimiser.step() self.linear_optimiser.step() def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) # Save model parameters on current device (don't move model between devices) def save(self, path, name='model.pth'): torch.save(self.online_net.state_dict(), os.path.join(path, name)) # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): with torch.no_grad(): return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item() def train(self): self.online_net.train() def eval(self): self.online_net.eval()
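# learn() above projects the shifted support Tz = R^n + (gamma^n) z back onto the
# fixed atoms (the C51 categorical projection). A self-contained sketch of that
# step with toy numbers; batch size, atom count and probabilities are illustrative only.
import torch

atoms, Vmin, Vmax = 5, -1.0, 1.0
support = torch.linspace(Vmin, Vmax, atoms)          # z
delta_z = (Vmax - Vmin) / (atoms - 1)

batch, discount, n = 2, 0.99, 3
returns = torch.tensor([0.3, -0.2])                  # n-step returns R^n
nonterminals = torch.tensor([[1.0], [0.0]])          # 0 where the episode ended
pns_a = torch.full((batch, atoms), 1.0 / atoms)      # p(s_t+n, a*; target net), uniform here

Tz = (returns.unsqueeze(1) + nonterminals * (discount ** n) * support.unsqueeze(0)).clamp(Vmin, Vmax)
b = (Tz - Vmin) / delta_z                            # fractional atom index
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) * (l == u)] -= 1                           # keep mass when b lands exactly on an atom
u[(l < (atoms - 1)) * (l == u)] += 1

m = torch.zeros(batch, atoms)
offset = torch.linspace(0, (batch - 1) * atoms, batch).long().unsqueeze(1).expand(batch, atoms)
m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))
# Each row of m sums to 1: the projected target distribution for the cross-entropy loss.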
class DQNAgent: def __init__(self, config: Config): self.config = config self.is_training = True self.buffer = ReplayBuffer(self.config.max_buff) self.model = DQN(self.config.state_dim, self.config.action_dim).cuda() self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate) if self.config.use_cuda: self.cuda() def act(self, state, epsilon=None): if epsilon is None: epsilon = self.config.epsilon_min if random.random() > epsilon or not self.is_training: state = torch.tensor(state, dtype=torch.float).unsqueeze(0) if self.config.use_cuda: state = state.cuda() q_value = self.model.forward(state) action = q_value.max(1)[1].item() else: action = random.randrange(self.config.action_dim) return action def learning(self, fr): s0, a, r, s1, done = self.buffer.sample(self.config.batch_size) s0 = torch.tensor(s0, dtype=torch.float) s1 = torch.tensor(s1, dtype=torch.float) a = torch.tensor(a, dtype=torch.long) r = torch.tensor(r, dtype=torch.float) done = torch.tensor(done, dtype=torch.float) if self.config.use_cuda: s0 = s0.cuda() s1 = s1.cuda() a = a.cuda() r = r.cuda() done = done.cuda() q_values = self.model(s0).cuda() next_q_values = self.model(s1).cuda() next_q_value = next_q_values.max(1)[0] q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1) expected_q_value = r + self.config.gamma * next_q_value * (1 - done) # Notice that detach the expected_q_value loss = (q_value - expected_q_value.detach()).pow(2).mean() self.model_optim.zero_grad() loss.backward() self.model_optim.step() return loss.item() def cuda(self): self.model.cuda() def load_weights(self, model_path): if model_path is None: return self.model.load_state_dict(torch.load(model_path)) def save_model(self, output, tag=''): torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag)) def save_config(self, output): with open(output + '/config.txt', 'w') as f: attr_val = get_class_attr_val(self.config) for k, v in attr_val.items(): f.write(str(k) + " = " + str(v) + "\n")
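# Hedged usage sketch for the DQNAgent above: an exponentially decaying epsilon
# feeding act() and learning(). The schedule constants, environment object and
# buffer method names below are placeholders, not values from the original Config.
import math


def epsilon_by_frame(fr, eps_start=1.0, eps_min=0.01, eps_decay=30000):
    """Exponential decay from eps_start towards eps_min."""
    return eps_min + (eps_start - eps_min) * math.exp(-fr / eps_decay)

# state = env.reset()
# for fr in range(1, max_frames + 1):
#     action = agent.act(state, epsilon_by_frame(fr))
#     next_state, reward, done, _ = env.step(action)
#     agent.buffer.add(state, action, reward, next_state, done)   # assumed buffer API
#     state = next_state if not done else env.reset()
#     if fr > config.batch_size:
#         loss = agent.learning(fr)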
class DQNAgent: def __init__(self, env, render, config_info): self.env = env self._reset_env() self.render = render # Set seeds self.seed = 0 env.seed(self.seed) torch.manual_seed(self.seed) np.random.seed(self.seed) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Device in use : {self.device}") # Define checkpoint checkpoint = Checkpoint(self.device, **config_info) # Create / load checkpoint dict ( self.ckpt, self.path_ckpt_dict, self.path_ckpt, config, ) = checkpoint.manage_checkpoint() # Unroll useful parameters from config dict self.batch_size = config["training"]["batch_size"] self.max_timesteps = config["training"]["max_timesteps"] self.replay_size = config["training"]["replay_size"] self.start_temp = config["training"]["start_temperature"] self.final_temp = config["training"]["final_temperature"] self.decay_temp = config["training"]["decay_temperature"] self.gamma = config["training"]["gamma"] self.early_stopping = config["training"]["early_stopping"] self.update_frequency = config["training"]["update_frequency"] self.eval_frequency = config["training"]["eval_frequency"] # Define state and action dimension spaces state_dim = env.observation_space.shape[0] action_dim = env.action_space.n # Define Q-network and target Q-network self.network = DQN(state_dim, action_dim, **config["model"]).to(self.device) self.target_network = DQN(state_dim, action_dim, **config["model"]).to( self.device ) # Loss and optimizer self.criterion = nn.MSELoss() lr = config["optimizer"]["learning_rate"] self.optimizer = optim.Adam(self.network.parameters(), lr=lr) # Load network's weight if resume training checkpoint.load_weights( self.ckpt, self.network, self.target_network, self.optimizer ) # Initialize replay buffer self.replay_buffer = ReplayBuffer(self.replay_size) self.transition = namedtuple( "transition", field_names=["state", "action", "reward", "done", "next_state"], ) def _reset_env(self): self.state, self.done = self.env.reset(), False self.episode_reward = 0.0 def play_step(self, temperature=1): reward_signal = None # Boltmann exploration state_v = torch.tensor(self.state, dtype=torch.float32).to(self.device) q_values = self.network(state_v) probas = Categorical(F.softmax(q_values / temperature, dim=0)) action = probas.sample().item() # Perform one step in the environment next_state, reward, self.done, _ = self.env.step(action) # Create a tuple for the new transition new_transition = self.transition( self.state, action, reward, self.done, next_state ) # Add transition to the replay buffer self.replay_buffer.store_transition(new_transition) self.state = next_state self.episode_reward += reward if self.render: self.env.render() if self.done: reward_signal = self.episode_reward self._reset_env() return reward_signal def train(self): # Initializations all_episode_rewards = [] episode_timestep = 0 best_mean_reward = None episode_num = 0 temp = self.start_temp # start epsilon to explore while filling the buffer writer = SummaryWriter(log_dir=self.path_ckpt, comment="-dqn") # Evaluate untrained policy evaluations = [self.eval_policy()] # Training loop for t in range(int(self.max_timesteps)): episode_timestep += 1 # -> is None if episode is not terminated # -> is episode reward when episode is terminated reward_signal = self.play_step(temp) # when episode is terminated if reward_signal is not None: episode_reward = reward_signal mean_reward = np.mean(all_episode_rewards[-10:]) print( f"Timestep [{t + 1}/{int(self.max_timesteps)}] ; " f"Episode num : {episode_num + 1} ; 
" f"Episode length : {episode_timestep} ; " f"Reward : {episode_reward:.2f} ; " f"Mean reward {mean_reward:.2f}" ) # Save episode's reward & reset counters all_episode_rewards.append(episode_reward) episode_timestep = 0 episode_num += 1 # Save checkpoint self.ckpt["episode_num"] = episode_num self.ckpt["all_episode_rewards"].append(episode_reward) self.ckpt["optimizer_state_dict"] = self.optimizer.state_dict() torch.save(self.ckpt, self.path_ckpt_dict) writer.add_scalar("episode reward", episode_reward, t) writer.add_scalar("mean reward", mean_reward, t) # Save network if performance is better than average if best_mean_reward is None or best_mean_reward < mean_reward: self.ckpt["best_mean_reward"] = mean_reward self.ckpt["model_state_dict"] = self.network.state_dict() self.ckpt[ "target_model_state_dict" ] = self.target_network.state_dict() if best_mean_reward is not None: print(f"Best mean reward updated : {best_mean_reward}") best_mean_reward = mean_reward # Criterion to early stop training if mean_reward > self.early_stopping: self.plot_reward() print(f"Solved in {t + 1} timesteps!") break # Fill the replay buffer if len(self.replay_buffer) < self.replay_size: continue else: # Adjust exploration parameter temp = np.maximum( self.final_temp, self.start_temp - (t / self.decay_temp) ) writer.add_scalar("temperature", temp, t) # Get the weights of the network before update weights_network = self.network.state_dict() # when it's time perform a batch gradient descent if t % self.update_frequency == 0: # Backward and optimize self.optimizer.zero_grad() batch = self.replay_buffer.sample_buffer(self.batch_size) loss = self.train_on_batch(batch) loss.backward() self.optimizer.step() # Synchronize target network self.target_network.load_state_dict(weights_network) # Evaluate episode if (t + 1) % self.eval_frequency == 0: evaluations.append(self.eval_policy()) np.save(self.path_ckpt, evaluations) def train_on_batch(self, batch_samples): # Unpack batch_size of transitions randomly drawn from the replay buffer states, actions, rewards, dones, next_states = batch_samples # Transform np arrays into tensors and send them to device states_v = torch.tensor(states).to(self.device) next_states_v = torch.tensor(next_states).to(self.device) actions_v = torch.tensor(actions).to(self.device) rewards_v = torch.tensor(rewards).to(self.device) dones_bool = torch.tensor(dones, dtype=torch.bool).to(self.device) # Vectorized version q_vals = self.network(states_v) # dim=batch_size x num_actions # Get the Q-values corresponding to the action q_vals = q_vals.gather(1, actions_v.view(-1, 1)) q_vals = q_vals.view(1, -1)[0] target_next_q_vals = self.target_network(next_states_v) # Max action of the target Q-values target_max_next_q_vals, _ = torch.max(target_next_q_vals, dim=1) # If state is terminal target_max_next_q_vals[dones_bool] = 0.0 # No update of the target during backpropagation target_max_next_q_vals = target_max_next_q_vals.detach() # Bellman approximation for target Q-values target_q_vals = rewards_v + self.gamma * target_max_next_q_vals return self.criterion(q_vals, target_q_vals) def eval_policy(self, eval_episodes=10): # Runs policy for X episodes and returns average reward # A fixed seed is used for the eval environment self.env.seed(self.seed + 100) avg_reward = 0.0 temperature = 1 for _ in range(eval_episodes): self._reset_env() reward_signal = None while reward_signal is None: reward_signal = self.play_step(temperature) avg_reward += reward_signal avg_reward /= eval_episodes 
print("---------------------------------------") print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}") print("---------------------------------------") return avg_reward def plot_reward(self): plt.plot(self.ckpt["all_episode_rewards"]) plt.xlabel("Episode") plt.ylabel("Reward") plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment") plt.tight_layout() path_fig = os.path.join(self.path_ckpt, "figure.png") plt.savefig(path_fig) print(f"Figure saved to {path_fig}") plt.show()
env = StocksEnv(prices) env_val = StocksEnv(val_price) net = DQN(env.observation_space.shape[0], env.action_space.n) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params.epsilon_start) epsilon_tracker = ptan.actions.EpsilonTracker( selector, params.epsilon_start, params.epsilon_final, params.epsilon_steps) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast( env, agent, gamma=args.gamma, steps_count=args.reward_steps) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params.replay_size) optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) def process_batch(engine, batch): optimizer.zero_grad() loss_v = common.calc_loss_dqn( batch, net, tgt_net.target_model, gamma=params.gamma, device=device) loss_v.backward() optimizer.step() epsilon_tracker.frame(engine.state.iteration) if engine.state.iteration % params.target_net_sync == 0: tgt_net.sync() return { 'loss': loss_v.item(), 'epsilon': selector.epsilon }
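# process_batch above is written as an ignite-style process function, but the
# snippet never builds the Engine or the batch stream. A hedged sketch of that
# wiring; the initial-fill size is an assumption and params.replay_initial is a
# hypothetical field, so the run call is left commented.
from ignite.engine import Engine


def batch_generator(buffer, initial, batch_size):
    buffer.populate(initial)        # play enough steps to pre-fill the replay buffer
    while True:
        buffer.populate(1)          # one new environment transition per training batch
        yield buffer.sample(batch_size)


engine = Engine(process_batch)
# engine.run(batch_generator(buffer, params.replay_initial, params.batch_size))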
class Agent: """ The intelligent agent of the simulation. Set the model of the neural network used and general parameters. It is responsible to select the actions, optimize the neural network and manage the models. """ def __init__(self, action_set, train=True, load_path=None): #1. Initialize agent params self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.action_set = action_set self.action_number = len(action_set) self.steps_done = 0 self.epsilon = Config.EPS_START self.episode_durations = [] #2. Build networks self.policy_net = DQN().to(self.device) self.target_net = DQN().to(self.device) self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE) if not train: self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0) self.policy_net.load(load_path, optimizer=self.optimizer) self.policy_net.eval() self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() #3. Create Prioritized Experience Replay Memory self.memory = Memory(Config.MEMORY_SIZE) def append_sample(self, state, action, next_state, reward): """ save sample (error,<s,a,s',r>) to the replay memory """ # Define if is the end of the simulation done = True if next_state is None else False # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken state_action_values = self.policy_net(state) state_action_values = state_action_values.gather(1, action.view(-1,1)) if not done: # Compute argmax Q(s', a; θ) next_state_actions = self.policy_net(next_state).max(1)[1].detach().unsqueeze(1) # Compute Q(s', argmax Q(s', a; θ), θ-) next_state_values = self.target_net(next_state).gather(1, next_state_actions).squeeze(1).detach() # Compute the expected Q values expected_state_action_values = (next_state_values * Config.GAMMA) + reward else: expected_state_action_values = reward error = abs(state_action_values - expected_state_action_values).data.cpu().numpy() self.memory.add(error, state, action, next_state, reward) def select_action(self, state, train=True): """ Selet the best action according to the Q-values outputed from the neural network Parameters ---------- state: float ndarray The current state on the simulation train: bool Define if we are evaluating or trainning the model Returns ------- a.max(1)[1]: int The action with the highest Q-value a.max(0): float The Q-value of the action taken """ global steps_done sample = random.random() #1. Perform a epsilon-greedy algorithm #a. set the value for epsilon self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \ math.exp(-1. * self.steps_done / Config.EPS_DECAY) self.steps_done += 1 #b. make the decision for selecting a random action or selecting an action from the neural network if sample > self.epsilon or (not train): # select an action from the neural network with torch.no_grad(): # a <- argmax Q(s, theta) a = self.policy_net(state) return a.max(1)[1].view(1, 1), a.max(0) else: # select a random action print('random action') return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long), None """ def select_action(self, state, train=True): Selet the best action according to the Q-values outputed from the neural network Parameters ---------- state: float ndarray The current state on the simulation train: bool Define if we are evaluating or trainning the model Returns ------- a.max(1)[1]: int The action with the highest Q-value a.max(0): float The Q-value of the action taken global steps_done sample = random.random() #1. 
Perform a epsilon-greedy algorithm #a. set the value for epsilon self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \ math.exp(-1. * self.steps_done / Config.EPS_DECAY) self.steps_done += 1 #b. make the decision for selecting a random action or selecting an action from the neural network if sample > self.epsilon or (not train): # select an action from the neural network with torch.no_grad(): # a <- argmax Q(s, theta) #set the network to train mode is important to enable dropout self.policy_net.train() output_list = [] # Retrieve the outputs from neural network feedfoward n times to build a statistic model for i in range(Config.STOCHASTIC_PASSES): #print(agent.policy_net(data)) output_list.append(torch.unsqueeze(F.softmax(self.policy_net(state)), 0)) #print(output_list[i]) self.policy_net.eval() # The result of the network is the mean of n passes output_mean = torch.cat(output_list, 0).mean(0) q_value = output_mean.data.cpu().numpy().max() action = output_mean.max(1)[1].view(1, 1) uncertainty = torch.cat(output_list, 0).var(0).mean().item() return action, q_value, uncertainty else: # select a random action print('random action') return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long), None, None """ def optimize_model(self): """ Perform one step of optimization on the neural network """ if self.memory.tree.n_entries < Config.BATCH_SIZE: return transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE) # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation). batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.uint8) non_final_next_states = torch.cat([s for s in batch.next_state if s is not None]) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken state_action_values = self.policy_net(state_batch).gather(1, action_batch) # Compute argmax Q(s', a; θ) next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1) # Compute Q(s', argmax Q(s', a; θ), θ-) next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device) next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach() # Compute the expected Q values expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch # Update priorities errors = torch.abs(state_action_values.squeeze() - expected_state_action_values).data.cpu().numpy() # update priority for i in range(Config.BATCH_SIZE): idx = idxs[i] self.memory.update(idx, errors[i]) # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) loss_return = loss.item() # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() return loss_return def save(self, step, logs_path, label): """ Save the model on hard disc Parameters ---------- step: int current step on the simulation logs_path: string path to where we will store the model label: string label that will be used to store the model """ os.makedirs(logs_path + label, exist_ok=True) full_label = label + str(step) + '.pth' logs_path = os.path.join(logs_path, 
label, full_label) self.policy_net.save(logs_path, step=step, optimizer=self.optimizer) def restore(self, logs_path): """ Load the model from hard disk Parameters ---------- logs_path: string path from which the model will be loaded """ self.policy_net.load(logs_path) self.target_net.load(logs_path)
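# Hedged usage sketch tying the Agent above together: select_action feeds the
# environment, append_sample stores prioritized transitions, optimize_model trains.
# The environment, preprocessing helper and update/save periods are placeholders.
# agent = Agent(action_set=[0, 1], train=True)
# state = to_tensor(env.reset())                      # hypothetical preprocessing helper
# for step in range(num_steps):
#     action, _ = agent.select_action(state)
#     next_obs, reward, done, _ = env.step(action.item())
#     next_state = None if done else to_tensor(next_obs)
#     agent.append_sample(state, action, next_state,
#                         torch.tensor([reward], device=agent.device))
#     loss = agent.optimize_model()                    # PER sampling + Huber loss
#     state = to_tensor(env.reset()) if done else next_state
#     if step % 1000 == 0:                             # placeholder sync period
#         agent.target_net.load_state_dict(agent.policy_net.state_dict())
#     if step % 10000 == 0:                            # placeholder save period
#         agent.save(step, logs_path='./logs/', label='dqn_')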