class Distrib_learner():

    def __init__(self, state_size, action_size, N, Vmin, Vmax, hiddens, args, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        self.hiddens = hiddens
        self.BUFFER_SIZE = args["BUFFER_SIZE"]
        self.BATCH_SIZE = args["BATCH_SIZE"]
        self.GAMMA = args["GAMMA"]
        self.UPDATE_EVERY = args["UPDATE_EVERY"]
        self.UPDATE_TARGET = args["UPDATE_TARGET"]
        self.LR = args["LR"]
        self.TAU = args["TAU"]

        # Support of the return distribution: N atoms evenly spaced on [Vmin, Vmax].
        self.N = N
        self.Vmin = Vmin
        self.Vmax = Vmax
        self.delta_z = (Vmax - Vmin) / (N - 1)
        self.range_batch = torch.arange(self.BATCH_SIZE).long().to(device)

        self.qnetwork_local = Distrib_QNetwork(state_size, action_size, self.N,
                                               hiddens, seed).to(device)
        self.qnetwork_target = Distrib_QNetwork(state_size, action_size, self.N,
                                                hiddens, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        self.t_step = 0
        # self.t_tot = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps, once enough samples are stored.
        # self.t_tot += 1
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        # self.update_target = (self.t_tot + 1) % self.UPDATE_TARGET
        if self.t_step == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)[0][0]
        self.qnetwork_local.eval()
        with torch.no_grad():
            # Atom values z_i = Vmin + i * delta_z, used to turn the predicted
            # distribution into expected action values.
            z_dist = torch.from_numpy(
                np.array([[self.Vmin + i * self.delta_z
                           for i in range(self.N)]])).to(device)
            z_dist = torch.unsqueeze(z_dist, 2).float()
            Q_dist, _ = self.qnetwork_local(state)
            Q_target = torch.matmul(Q_dist, z_dist).squeeze(1)
            a_star = torch.argmax(Q_target, dim=1)[0]
        if eps != 0.:
            self.qnetwork_local.train()
        # Epsilon-greedy action selection.
        if random.random() > eps:
            return a_star.cpu().data.numpy()[0]
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        actions = actions.long()

        z_dist = torch.from_numpy(
            np.array([[self.Vmin + i * self.delta_z for i in range(self.N)]] *
                     self.BATCH_SIZE)).to(device)
        z_dist = torch.unsqueeze(z_dist, 2).float()

        # Log-probabilities of the local network for the actions actually taken.
        _, log_Q_dist_prediction = self.qnetwork_local(states)
        log_Q_dist_prediction = log_Q_dist_prediction[self.range_batch,
                                                      actions.squeeze(1), :]

        # Greedy next actions under the target network's expected values.
        Q_dist_target, _ = self.qnetwork_target(next_states)
        Q_dist_target = Q_dist_target.detach()
        Q_target = torch.matmul(Q_dist_target, z_dist).squeeze(1)
        a_star = torch.argmax(Q_target, dim=1)
        Q_dist_star = Q_dist_target[self.range_batch, a_star.squeeze(1), :]

        # Project the Bellman-updated atoms T_zj back onto the fixed support
        # (the categorical projection used by C51).
        m = torch.zeros(self.BATCH_SIZE, self.N).to(device)
        for j in range(self.N):
            T_zj = torch.clamp(rewards + self.GAMMA * (1 - dones) *
                               (self.Vmin + j * self.delta_z),
                               min=self.Vmin, max=self.Vmax)
            bj = (T_zj - self.Vmin) / self.delta_z
            l = bj.floor().long()
            u = bj.ceil().long()
            mask_Q_l = torch.zeros(m.size()).to(device)
            mask_Q_l.scatter_(1, l, Q_dist_star[:, j].unsqueeze(1))
            mask_Q_u = torch.zeros(m.size()).to(device)
            mask_Q_u.scatter_(1, u, Q_dist_star[:, j].unsqueeze(1))
            m += mask_Q_l * (u.float() + (l == u).float() - bj.float())
            m += mask_Q_u * (-l.float() + bj.float())

        # Cross-entropy between the projected target and the predicted distribution.
        loss = -torch.sum(torch.sum(torch.mul(log_Q_dist_prediction, m),
                                    -1), -1) / self.BATCH_SIZE
        self.optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 5)
        self.optimizer.step()

        # if self.update_target == 0:
        #     self.hard_update(self.qnetwork_local, self.qnetwork_target)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)
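# Illustrative usage sketch (not part of the original code): shows how the
# Distrib_learner constructor arguments fit together. The hyperparameter values,
# the hidden-layer spec, and the state/action sizes below are assumptions.
def _example_distrib_learner():
    example_args = {
        "BUFFER_SIZE": int(1e5),
        "BATCH_SIZE": 64,
        "GAMMA": 0.99,
        "UPDATE_EVERY": 4,
        "UPDATE_TARGET": 1000,
        "LR": 5e-4,
        "TAU": 1e-3,
    }
    # 51 atoms on [-10, 10], i.e. the usual C51-style support.
    return Distrib_learner(state_size=8, action_size=4, N=51,
                           Vmin=-10.0, Vmax=10.0, hiddens=[64, 64],
                           args=example_args, seed=0)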
class Q_learner():

    def __init__(self, state_size, action_size, hiddens, args, seed, zerocenter):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.hiddens = hiddens
        self.BUFFER_SIZE = args["BUFFER_SIZE"]
        self.BATCH_SIZE = args["BATCH_SIZE"]
        self.GAMMA = args["GAMMA"]
        self.UPDATE_EVERY = args["UPDATE_EVERY"]
        self.LR = args["LR"]
        self.TAU = args["TAU"]
        self.zerocenter = zerocenter

        self.qnetwork_local = QNetwork(state_size, action_size, hiddens, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hiddens, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY steps, once enough samples are stored.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection.
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        actions = actions.long()

        # Standard DQN target: r + gamma * max_a' Q_target(s', a') for non-terminal s'.
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        # Polyak averaging of the target network towards the local network.
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
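# Illustrative training-loop sketch (not part of the original code). It assumes a
# Gym-style environment with the classic 4-tuple step() API; the epsilon schedule,
# hyperparameter values, and zerocenter=False are placeholder assumptions.
def _example_q_learner_training(env, n_episodes=200, eps_start=1.0,
                                eps_end=0.01, eps_decay=0.995):
    args = {"BUFFER_SIZE": int(1e5), "BATCH_SIZE": 64, "GAMMA": 0.99,
            "UPDATE_EVERY": 4, "LR": 5e-4, "TAU": 1e-3}
    agent = Q_learner(state_size=env.observation_space.shape[0],
                      action_size=env.action_space.n,
                      hiddens=[64, 64], args=args, seed=0, zerocenter=False)
    eps = eps_start
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
        eps = max(eps_end, eps_decay * eps)  # decay exploration after each episode
    return agent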
class picnn_learner():

    def __init__(self, state_size, action_size, actions_range, hiddens, args, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.actions_range = actions_range
        self.seed = seed
        random.seed(seed)
        self.hiddens = hiddens
        self.ou_theta = args["ou_theta"]
        self.ou_sigma = args["ou_sigma"]
        self.BUFFER_SIZE = args["BUFFER_SIZE"]
        self.BATCH_SIZE = args["BATCH_SIZE"]
        self.GAMMA = args["GAMMA"]
        self.UPDATE_EVERY = args["UPDATE_EVERY"]
        self.WARM_UP = args["WARM_UP"]
        self.LR = args["LR"]
        self.TAU = args["TAU"]
        self.grad_norm = args["grad_norm"]

        self.qnetwork_local = picnn_network(input_shape=state_size,
                                            action_shape=action_size,
                                            actions_range=actions_range,
                                            hiddens=hiddens,
                                            seed=seed).to(device)
        self.qnetwork_target = picnn_network(input_shape=state_size,
                                             action_shape=action_size,
                                             actions_range=actions_range,
                                             hiddens=hiddens,
                                             seed=seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY steps, after WARM_UP transitions are stored.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > self.WARM_UP:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, test=False):
        actions_min, actions_max = self.actions_range
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Greedy action of the input-convex network (needs gradients, hence no eval()).
        # self.qnetwork_local.eval()
        # with torch.no_grad():
        a_star = self.qnetwork_local.best_action(state)["actions"]
        if not test:
            # Ornstein-Uhlenbeck exploration noise, mean-reverting around zero.
            x = getattr(self, "noise", a_star.clone().zero_())
            mu = a_star.clone().zero_()
            dx = self.ou_theta * (mu - x) + self.ou_sigma * x.clone().normal_()
            self.noise = x + dx
            a_star += self.noise
        return a_star

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # TD target built from the target network evaluated at the local network's
        # greedy next actions.
        next_actions = self.qnetwork_local.best_action(next_states)["actions"]
        max_Q = self.qnetwork_target.forward(observation=next_states,
                                             actions=next_actions,
                                             entropy=True)["Q"][0][0].detach()
        targets = rewards + (1 - dones) * self.GAMMA * max_Q
        predictions = self.qnetwork_local.forward(observation=states,
                                                  actions=actions,
                                                  entropy=True)["Q"][0][0]

        loss = torch.mean((targets - predictions) ** 2)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.grad_norm)
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)
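# Note (illustrative, not part of the original code): the exploration noise built up
# in picnn_learner.act follows an Ornstein-Uhlenbeck style update around mu = 0,
#     x_{t+1} = x_t + theta * (mu - x_t) + sigma * eps,   eps ~ N(0, I),
# so the noise is mean-reverting. A minimal standalone sketch of one update step;
# the default theta/sigma are common choices, not this repo's values (those come
# from args["ou_theta"] and args["ou_sigma"]).
def _example_ou_noise_step(x, theta=0.15, sigma=0.2):
    mu = torch.zeros_like(x)
    return x + theta * (mu - x) + sigma * torch.randn_like(x)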
class BQ_learner():

    def __init__(self, state_size, action_size, hiddens, args, seed, prev_means=None):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.hiddens = hiddens
        self.BUFFER_SIZE = args["BUFFER_SIZE"]
        self.BATCH_SIZE = args["BATCH_SIZE"]
        self.GAMMA = args["GAMMA"]
        self.UPDATE_EVERY = args["UPDATE_EVERY"]
        self.LR = args["LR"]
        self.TAU = args["TAU"]
        self.print_graph_bol = False
        self.task_idx = 0

        self.qnetwork_local = Bayesian_QNetwork(input_size=state_size,
                                                output_size=action_size,
                                                hidden_size=hiddens,
                                                seed=seed,
                                                prev_means=prev_means)
        self.qnetwork_target = Bayesian_QNetwork(input_size=state_size,
                                                 output_size=action_size,
                                                 hidden_size=hiddens,
                                                 seed=seed,
                                                 prev_means=prev_means)
        self.optimizer = optim.Adam(self.qnetwork_local.weights, lr=self.LR)

        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        self.t_step = 0

    def next_task(self):
        # Move to the next task: the learned posterior becomes the new prior,
        # a fresh output head is created, and the target network is hard-synced.
        self.qnetwork_local.update_prior()
        self.qnetwork_target.update_prior()
        self.qnetwork_local.create_head()
        self.qnetwork_target.create_head()
        self.full_update()

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                mean_variance = self.learn(experiences, self.GAMMA)
                return mean_variance

    def act(self, state, task_idx=0, eps=0.):
        self.task_idx = task_idx
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(
                state,
                no_samples=self.qnetwork_local.no_samples_test,
                task_idx=task_idx)
        # Epsilon-greedy action selection.
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        actions = actions.long()

        # TODO: sample different parameters for each element of the batch.
        # Note: the .view(-1, 2) below assumes a two-action environment.
        no_samples_Q_target = 1
        Q_targets_next = self.qnetwork_target.forward(
            next_states,
            task_idx=self.task_idx,
            no_samples=no_samples_Q_target).view(-1, 2).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards.repeat([no_samples_Q_target, 1]) + (
            gamma * Q_targets_next * (1 - dones.repeat([no_samples_Q_target, 1])))

        # Unparallelized alternative:
        #   Q_targets_next = self.qnetwork_target.forward_diff_params(
        #       next_states, no_samples=1).detach().max(1)[0].unsqueeze(1)
        #   Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        loss = self.qnetwork_local.get_loss(states, actions, Q_targets,
                                            no_samples_Q_target)

        if self.print_graph_bol:
            # Only used to visualize the computational graph once.
            print_graph(self.qnetwork_local, loss)
            self.print_graph_bol = False

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)

        mean_variance = self.qnetwork_local.get_variance()
        return mean_variance

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.weights,
                                             local_model.weights):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def full_update(self):
        for target_param, local_param in zip(self.qnetwork_target.weights,
                                             self.qnetwork_local.weights):
            target_param.data.copy_(local_param.data)
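# Illustrative continual-learning sketch (not part of the original code). BQ_learner
# exposes next_task(), which (per the method names) turns the current posterior into
# the new prior, adds a fresh output head, and hard-syncs the target network. The
# surrounding calls are placeholders; `agent` is an existing BQ_learner and `state`
# is a NumPy observation.
def _example_bq_learner_task_switch(agent, state):
    action_task0 = agent.act(state, task_idx=0, eps=0.1)   # act on the first task
    agent.next_task()                                      # consolidate task 0, add a head
    action_task1 = agent.act(state, task_idx=1, eps=0.1)   # act with the new head
    return action_task0, action_task1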
class BBB_learner():

    def __init__(self, state_size, action_size, N, Vmin, Vmax, hiddens, args, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        self.hiddens = hiddens
        self.BUFFER_SIZE = args["BUFFER_SIZE"]
        self.BATCH_SIZE = args["BATCH_SIZE"]
        self.GAMMA = args["GAMMA"]
        self.UPDATE_EVERY = args["UPDATE_EVERY"]
        self.UPDATE_TARGET = args["UPDATE_TARGET"]
        self.LR = args["LR"]
        self.TAU = args["TAU"]
        self.N = N
        self.Vmin = Vmin
        self.Vmax = Vmax
        self.delta_z = (Vmax - Vmin) / (N - 1)
        self.range_batch = torch.arange(self.BATCH_SIZE).long().to(device)

        self.qnetwork_local = Distrib_QNetwork(state_size, action_size, self.N,
                                               hiddens, seed).to(device)
        self.qnetwork_target = Distrib_QNetwork(state_size, action_size, self.N,
                                                hiddens, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # self.t_tot += 1
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        # self.update_target = (self.t_tot + 1) % self.UPDATE_TARGET
        if self.t_step == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)[0][0]
        self.qnetwork_local.eval()
        with torch.no_grad():
            z_dist = torch.from_numpy(
                np.array([[self.Vmin + i * self.delta_z
                           for i in range(self.N)]])).to(device)
            z_dist = torch.unsqueeze(z_dist, 2).float()
            Q_dist, _ = self.qnetwork_local(state)
            Q_target = torch.matmul(Q_dist, z_dist).squeeze(1)
            a_star = torch.argmax(Q_target, dim=1)[0]
        if eps != 0.:
            self.qnetwork_local.train()
        if random.random() > eps:
            return a_star.cpu().data.numpy()[0]
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        # TODO: the Bayes-by-Backprop loss is not implemented yet.
        raise NotImplementedError("BBB_learner.learn: loss computation not implemented")

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)