import math

import numpy as np
import scipy.optimize
import torch
from torch.autograd import Variable

# Project-specific components (Agent, Network, DualNetwork, TD3Network, Memory,
# Replay, OUNoise, ZFilter, trpo_step, normal_log_density, and the flat-parameter
# helpers) are assumed to be imported from the surrounding package.


class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
            # Initialize the target network to match the online network
            for target_param, param in zip(self.tarNet.parameters(),
                                           self.valueNet.parameters()):
                target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            self.valueNet.load_state_dict(torch.load(load_path))

        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.beta = self.vPars['beta']
        self.priority = self.vTrain['priority']
        self.priorities = []
        self.alpha = .7
        self.double = self.vTrain['double']
        self.update_target_network = self.vTrain['update_target_network_every']
        if 'noise' in self.vTrain:
            self.noise = self.vTrain['noise']
        else:
            self.noise = 0

        task.initAgent(self)
        if not load_path:
            while not self.stop:  # busy-wait until the task signals completion
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/jimmy/Documents/Research/AN_Bridging/results/hierarchical_q_policy2.txt')

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)
        # New transitions enter with maximal priority (1)
        if len(self.priorities) < self.expSize:
            self.priorities.append(1)
        else:
            self.priorities = self.priorities[1:]
            self.priorities.append(1)

    def get_q(self, s):
        # If valueNet is an ensemble, query one randomly chosen member
        if type(self.valueNet) == list:
            model_index = np.random.randint(len(self.valueNet))
            net = self.valueNet[model_index]
        else:
            net = self.valueNet
        q = net(torch.FloatTensor(s))
        q = q.detach()
        return q

    def get_action(self, s, testing_time=False, probabilistic=False):
        i = np.random.random()
        if i < self.explore and self.trainMode and not testing_time:
            index = np.random.randint(self.out_n)  # epsilon-greedy random action
        else:
            q = self.get_q(s)
            if probabilistic:
                # Boltzmann (softmax) exploration over Q-values
                q = q.numpy()
                q = q - np.max(q)
                probs = np.exp(q * self.beta)
                probs = probs / np.sum(probs)
                index = np.random.choice(q.size, p=probs.ravel())
            else:
                index = np.argmax(q.numpy())
        self.explore = max(.2, self.explore * .9997)  # decay epsilon toward .2
        return index

    def get_q_and_q_tar(self, states, actions, nextStates, rewards, masks):
        qValues = self.valueNet(torch.FloatTensor(states).squeeze(1))
        q = torch.gather(qValues, 1,
                         torch.LongTensor(actions).unsqueeze(1))  # Q-values of the taken actions
        qnext = self.tarNet(torch.FloatTensor(nextStates))
        qnext = qnext.squeeze(1).detach()
        if self.double:
            # Double DQN: the online network selects the action, the target network evaluates it
            qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
            qNextDouble = qNextDouble.squeeze(1).detach()
            qnext = torch.gather(
                qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
            qtar = torch.FloatTensor(rewards).squeeze(1) + \
                self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
        else:
            qtar = torch.FloatTensor(rewards) + \
                self.discount * torch.Tensor(masks).unsqueeze(1) * \
                qnext.max(1)[0].view(self.batch_size, 1)
        return q, qtar

    def train(self, override=False):
        if len(self.exp) >= 500 or override:
            if self.priority:
                # Prioritized replay: sample one transition at a time and
                # weight its loss by the importance-sampling correction
                weights = []
                errors = []
                assert len(self.priorities) == len(self.exp)
                for i in range(self.batch_size):
                    probs = np.array([math.pow(p, self.alpha) for p in self.priorities])
                    probs = probs / np.sum(probs)
                    choice = np.random.choice(len(self.priorities), p=probs, size=1)
                    idx = int(choice[0])
                    weights.append(
                        math.pow(len(self.priorities) * self.priorities[idx], -self.beta))
                    states, actions, rewards, masks, _, nextStates, _, _, _ = \
                        self.exp.get_transitions(choice)
                    q, qtar = self.get_q_and_q_tar(states, actions, nextStates,
                                                   rewards, masks)
                    td = qtar - q
                    self.priorities[idx] = abs(float(td[0, 0]))  # update priority with |TD error|
                    errors.append(self.valueNet.get_loss(q, qtar))
                max_weight = max(weights)
                weights = [w / max_weight for w in weights]
                val_loss = sum([w * e for w, e in zip(weights, errors)])
            else:
                states, actions, rewards, masks, _, nextStates, _, _, _ = \
                    self.exp.sample(batch=self.batch_size)
                if self.replaceCounter % self.update_target_network == 0:
                    self.tarNet.load_state_dict(self.valueNet.state_dict())
                    self.replaceCounter = 0
                if self.noise:
                    # Optional Gaussian noise on the states for regularization
                    states = np.array(states)
                    states = states + np.random.normal(0, self.noise, states.shape)
                q, qtar = self.get_q_and_q_tar(states, actions, nextStates, rewards, masks)
                val_loss = self.valueNet.get_loss(q, qtar)

            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()
            self.replaceCounter += 1
            self.totalSteps += 1
            return val_loss
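# For reference, a minimal standalone sketch of the double-DQN target computed in
# get_q_and_q_tar above: the online network selects the next action, the target
# network evaluates it. Every name below (the stand-in linear "networks", sizes,
# and tensors) is illustrative only and not part of the agent.
def _sketch_double_dqn_target():
    import torch
    import torch.nn as nn

    value_net = nn.Linear(4, 3)    # stand-in online Q-network: 4-dim state, 3 actions
    target_net = nn.Linear(4, 3)   # stand-in target Q-network
    target_net.load_state_dict(value_net.state_dict())

    states = torch.randn(8, 4)
    next_states = torch.randn(8, 4)
    actions = torch.randint(0, 3, (8, 1))
    rewards = torch.randn(8, 1)
    masks = torch.ones(8, 1)       # 1 - done
    discount = 0.99

    q = value_net(states).gather(1, actions)  # Q(s, a) of the taken actions
    with torch.no_grad():
        best_next = value_net(next_states).argmax(1, keepdim=True)  # online net selects
        q_next = target_net(next_states).gather(1, best_next)       # target net evaluates
        q_target = rewards + discount * masks * q_next
    return nn.functional.mse_loss(q, q_target)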
class Twin_DDPG(Agent):
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            # Twin critics plus their targets (TD3)
            self.values = [Network(self.vPars, self.vTrain),
                           Network(self.vPars, self.vTrain)]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)
            if self.load:
                self.load_nets()
            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [Network(self.vPars, self.vTrain),
                        Network(self.vPars, self.vTrain)]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(torch.load(
                "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n, mu=0, theta=.15, max_sigma=self.explore,
                             min_sigma=self.base, decay=self.step)
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)
        while not self.stop:  # busy-wait until the task signals completion
            x = 1 + 1
        task.postTraining()

    def load_nets(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        self.policyNet.load_state_dict(torch.load(path + "policy.txt"))
        self.values[0].load_state_dict(torch.load(path + "Qvalue1.txt"))
        self.values[1].load_state_dict(torch.load(path + "Qvalue2.txt"))

    def saveModel(self):
        path = "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal3_"
        torch.save(self.policyNet.state_dict(), path + "policy.txt")
        torch.save(self.values[0].state_dict(), path + "Qvalue1.txt")
        torch.save(self.values[1].state_dict(), path + "Qvalue2.txt")
        print("Network saved")

    def get_action(self, s):
        output = self.policyNet(torch.FloatTensor(s))
        i = np.random.random()
        if i < self.explore[0]:
            # Gaussian exploration noise; TODO: switch to OU noise
            noise = torch.from_numpy(np.random.normal(0, self.explore[1], 2))
            output = output + noise
            output = output.float()
        return output[0]

    def train(self):
        if self.dataSize > 500 and self.trainMode:
            # Iteration counters
            self.trainIt += 1
            self.totalSteps += 1

            # Unpack the buffer and sample a minibatch
            s, a, r, n_s, n_a, done = self.exp.get_data()
            c = np.random.choice(min(self.dataSize, self.expSize), self.batch_size)
            s = torch.FloatTensor(s[c])
            a = torch.FloatTensor(a[c])
            r = torch.FloatTensor(r[c])
            n_s = torch.FloatTensor(n_s[c])
            done = torch.FloatTensor(done[c])

            # Target policy smoothing: add clipped noise to the target action
            n_a = self.tarPolicy(n_s).detach()
            noise = torch.FloatTensor(np.random.normal(0, self.smooth, n_a.shape))
            n_a = n_a + torch.clamp(noise, -self.clip, self.clip)

            # Clipped double-Q target: take the minimum of the two target critics
            n_sa = torch.cat((n_s, n_a), dim=1)
            qtar = r + self.discount * (1 - done) * torch.min(
                self.tar[0](n_sa).detach(), self.tar[1](n_sa).detach())

            # Value update
            sa = torch.cat((s, a), dim=1)
            for qnet in self.values:
                q = qnet(sa)
                loss = qnet.loss_fnc(q, qtar)
                qnet.optimizer.zero_grad()
                loss.backward()
                qnet.optimizer.step()
                qnet.scheduler.step()
                self.avgLoss += loss / len(self.values)

            # Delayed policy update
            if self.trainIt % self.delay == 0:
                act = self.policyNet(s)
                s_a = torch.cat((s, act), 1)
                q = self.values[0](s_a)
                policy_loss = -q.mean()

                self.policyNet.optimizer.zero_grad()
                policy_loss.backward()
                self.policyNet.optimizer.step()
                self.policyNet.scheduler.step()
                self.avgActLoss += policy_loss

                # Polyak (soft) update of the target policy and target critics
                for target_param, param in zip(self.tarPolicy.parameters(),
                                               self.policyNet.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1.0 - self.tau) * target_param.data)
                for i in range(len(self.values)):
                    for target_param, param in zip(self.tar[i].parameters(),
                                                   self.values[i].parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1.0 - self.tau) * target_param.data)
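# For reference, a minimal standalone sketch of the TD3 target constructed in
# Twin_DDPG.train above: target policy smoothing followed by the clipped
# double-Q minimum. All names below (the stand-in linear networks, dimensions,
# and hyperparameter values) are illustrative only.
def _sketch_td3_target():
    import torch
    import torch.nn as nn

    state_dim, act_dim, batch = 6, 2, 8
    target_policy = nn.Linear(state_dim, act_dim)
    target_q1 = nn.Linear(state_dim + act_dim, 1)  # stand-in target critic 1
    target_q2 = nn.Linear(state_dim + act_dim, 1)  # stand-in target critic 2

    next_states = torch.randn(batch, state_dim)
    rewards = torch.randn(batch, 1)
    done = torch.zeros(batch, 1)
    discount, smooth_sigma, noise_clip = 0.99, 0.2, 0.5

    with torch.no_grad():
        next_actions = target_policy(next_states)
        noise = (torch.randn_like(next_actions) * smooth_sigma).clamp(-noise_clip, noise_clip)
        next_actions = next_actions + noise  # smoothed target action
        nsa = torch.cat((next_states, next_actions), dim=1)
        q_target = rewards + discount * (1 - done) * torch.min(target_q1(nsa), target_q2(nsa))
    return q_target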
class DoubleQ(Agent):
    def __init__(self, params, name, task, load_path=None):
        super(DoubleQ, self).__init__(params, name, task)
        self.dual = self.vPars['dual']
        if self.trainMode:
            if self.dual:
                self.tarNet = DualNetwork(self.vPars, self.vTrain)
                self.valueNet = DualNetwork(self.vPars, self.vTrain)
            else:
                self.tarNet = Network(self.vPars, self.vTrain)
                self.valueNet = Network(self.vPars, self.vTrain)
            # Initialize the target network to match the online network
            for target_param, param in zip(self.tarNet.parameters(),
                                           self.valueNet.parameters()):
                target_param.data.copy_(param.data)
        else:
            self.valueNet = Network(self.vPars, self.vTrain)
            paths = [
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole.txt',
                '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/push_in_hole2.txt'
            ]
            if not load_path:
                # Load an ensemble of pretrained value networks
                self.valueNet = []
                for path in paths:
                    self.valueNet.append(Network(self.vPars, self.vTrain))
                    self.valueNet[-1].load_state_dict(torch.load(path))
            else:
                self.valueNet.load_state_dict(torch.load(load_path))

        self.out_n = self.vPars['neurons'][-1]
        self.replaceCounter = 0
        self.valueLoss = []
        self.avgLoss = 0
        self.expSize = self.vTrain['buffer']
        self.exp = Memory(size=self.expSize)
        self.double = self.vTrain['double']

        task.initAgent(self)
        if not load_path:
            while not self.stop:  # busy-wait until the task signals completion
                x = 1 + 1
            task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            '/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/box_push_hierarchical_q_policy.txt')

    def store(self, s, a, r, sprime, aprime, done):
        self.exp.push(s, a, r, 1 - done, aprime, sprime)

    def get_action(self, s):
        i = np.random.random()
        if i < self.explore and self.trainMode:
            index = np.random.randint(self.out_n)  # epsilon-greedy random action
        else:
            # If valueNet is an ensemble, query one randomly chosen member
            if type(self.valueNet) == list:
                model_index = np.random.randint(len(self.valueNet))
                net = self.valueNet[model_index]
            else:
                net = self.valueNet
            q = net(torch.FloatTensor(s))
            q = q.detach()
            index = np.argmax(q.numpy())
        self.explore = max(.1, self.explore * .9997)  # decay epsilon toward .1
        return index

    def train(self):
        if len(self.exp) >= 500:
            states, actions, rewards, masks, _, nextStates, _, _, _ = \
                self.exp.sample(batch=self.batch_size)
            if self.replaceCounter % 500 == 0:  # set to 200 for the box-push task; 500 for the slope task
                self.tarNet.load_state_dict(self.valueNet.state_dict())
                self.replaceCounter = 0
            qValues = self.valueNet(torch.FloatTensor(states).squeeze(1))
            q = torch.gather(qValues, 1,
                             torch.LongTensor(actions).unsqueeze(1))  # Q-values of the taken actions
            qnext = self.tarNet(torch.FloatTensor(nextStates))
            qnext = qnext.squeeze(1).detach()
            if self.double:
                # Double DQN: the online network selects the action, the target network evaluates it
                qNextDouble = self.valueNet(torch.FloatTensor(nextStates))
                qNextDouble = qNextDouble.squeeze(1).detach()
                qnext = torch.gather(
                    qnext, 1, torch.LongTensor(qNextDouble.argmax(1).unsqueeze(1)))
                qtar = torch.FloatTensor(rewards).squeeze(1) + \
                    self.discount * torch.Tensor(masks).unsqueeze(1) * qnext
            else:
                qtar = torch.FloatTensor(rewards) + \
                    self.discount * torch.Tensor(masks).unsqueeze(1) * \
                    qnext.max(1)[0].view(self.batch_size, 1)
            val_loss = self.valueNet.get_loss(q, qtar)
            self.valueNet.optimizer.zero_grad()
            val_loss.backward()
            self.valueNet.optimizer.step()
            self.replaceCounter += 1
            self.totalSteps += 1
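# For reference, a minimal sketch of the epsilon-greedy selection over a small
# ensemble used in get_action above: with probability epsilon take a random
# action, otherwise act greedily with respect to one randomly chosen Q-network.
# The stand-in networks, sizes, and rates below are illustrative only.
def _sketch_ensemble_epsilon_greedy(explore=1.0):
    import numpy as np
    import torch
    import torch.nn as nn

    nets = [nn.Linear(4, 3), nn.Linear(4, 3)]  # stand-in ensemble of Q-networks
    state = torch.randn(4)

    if np.random.random() < explore:
        action = np.random.randint(3)              # random action
    else:
        net = nets[np.random.randint(len(nets))]   # pick one ensemble member
        with torch.no_grad():
            action = int(net(state).argmax())      # greedy w.r.t. its Q-values
    explore = max(0.1, explore * 0.9997)           # decay epsilon toward the floor
    return action, explore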
class TRPOAgent(Agent):
    def __init__(self, params, name, task):
        super(TRPOAgent, self).__init__(params, name, task)
        self.valueNet = Network(self.vPars, self.vTrain)
        self.policyNet = Network(params['actPars'], params['actTrain'])
        self.running_state = ZFilter((self.vPars['in_n'],), clip=5)
        self.running_reward = ZFilter((1,), demean=False, clip=10)
        self.experience = Memory()
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0

        task.initAgent(self)
        while not self.stop:  # busy-wait until the task signals completion
            x = 1 + 1
        task.postTraining()

    def saveModel(self):
        torch.save(
            self.valueNet.state_dict(),
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOCritic.txt")
        torch.save(
            self.policyNet.state_dict(),
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TRPOPolicy.txt")
        print("Network saved")

    def train(self):
        batch = self.experience.sample()
        self.update_params(batch)

    def store(self, prevS, prevA, r, s, a, failure):
        mask = 0 if failure == 1 else 1
        self.experience.push(prevS, prevA, mask, s, r)

    def update_params(self, batch):
        # tau (the GAE lambda), l2Reg, maxKL, and damping are assumed to be
        # module-level hyperparameters.
        rewards = torch.Tensor(batch.reward)
        masks = torch.Tensor(batch.mask)
        actions = torch.Tensor(np.concatenate(batch.action, 0))
        states = torch.Tensor(batch.state)
        values = self.valueNet(Variable(states))

        # Generalized advantage estimation, computed backwards through the batch
        returns = torch.Tensor(actions.size(0), 1)
        deltas = torch.Tensor(actions.size(0), 1)
        advantages = torch.Tensor(actions.size(0), 1)
        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(rewards.size(0))):
            returns[i] = rewards[i] + self.discount * prev_return * masks[i]
            deltas[i] = rewards[i] + self.discount * prev_value * masks[i] - values.data[i]
            advantages[i] = deltas[i] + self.discount * tau * prev_advantage * masks[i]
            prev_return = returns[i, 0]
            prev_value = values.data[i, 0]
            prev_advantage = advantages[i, 0]
        targets = Variable(returns)

        # Original code uses the same L-BFGS routine to optimize the value loss
        def get_value_loss(flat_params):
            set_flat_params_to(self.valueNet, torch.Tensor(flat_params))
            for param in self.valueNet.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)
            values_ = self.valueNet(Variable(states))
            value_loss = (values_ - targets).pow(2).mean()
            # weight decay
            for param in self.valueNet.parameters():
                value_loss += param.pow(2).sum() * l2Reg
            value_loss.backward()
            return (value_loss.data.double().numpy(),
                    get_flat_grad_from(self.valueNet).data.double().numpy())

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_value_loss,
            get_flat_params_from(self.valueNet).double().numpy(),
            maxiter=25)
        set_flat_params_to(self.valueNet, torch.Tensor(flat_params))

        advantages = (advantages - advantages.mean()) / advantages.std()

        # The policy outputs the means and log-stds of a diagonal Gaussian
        output = self.policyNet(Variable(states)).view(-1, self.u_n * 2)
        action_means = output.narrow(1, 0, self.u_n)
        action_log_stds = output.narrow(1, self.u_n, self.u_n)
        action_stds = torch.exp(action_log_stds)
        fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                            action_log_stds, action_stds).data.clone()

        def get_loss(volatile=False):
            # Surrogate loss: importance-weighted advantage under the current policy
            if volatile:
                with torch.no_grad():
                    output = self.policyNet(Variable(states))
            else:
                output = self.policyNet(Variable(states))
            output = output.view(-1, self.u_n * 2)
            action_means = output.narrow(1, 0, self.u_n)
            action_log_stds = output.narrow(1, self.u_n, self.u_n)
            action_stds = torch.exp(action_log_stds)
            log_prob = normal_log_density(Variable(actions), action_means,
                                          action_log_stds, action_stds)
            action_loss = -Variable(advantages) * torch.exp(log_prob - Variable(fixed_log_prob))
            return action_loss.mean()

        def get_kl():
            # KL divergence between the old (detached) and current diagonal Gaussians
            output = self.policyNet(Variable(states))
            output = output.view(-1, self.u_n * 2)
            mean1 = output.narrow(1, 0, self.u_n)
            log_std1 = output.narrow(1, self.u_n, self.u_n)
            std1 = torch.exp(log_std1)
            mean0 = Variable(mean1.data)
            log_std0 = Variable(log_std1.data)
            std0 = Variable(std1.data)
            kl = log_std1 - log_std0 + \
                (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
            return kl.sum(1, keepdim=True)

        loss = trpo_step(self.policyNet, get_loss, get_kl, maxKL, damping)
        self.avgLoss += loss
        self.trainIt += 1
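# For reference, a minimal standalone version of the GAE recursion in
# update_params above, assuming scalar `discount` and `tau` (the GAE lambda).
# The example rewards, values, and masks in the commented call are illustrative only.
def _sketch_gae(rewards, values, masks, discount=0.99, tau=0.95):
    import torch

    advantages = torch.zeros(len(rewards))
    prev_value, prev_advantage = 0.0, 0.0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + discount * prev_value * masks[i] - values[i]
        advantages[i] = delta + discount * tau * prev_advantage * masks[i]
        prev_value = values[i]
        prev_advantage = advantages[i]
    # Normalize, as in update_params
    return (advantages - advantages.mean()) / advantages.std()


# Example: a 4-step episode where the last transition is terminal (mask 0).
# _sketch_gae(torch.tensor([1.0, 0.0, 0.5, 1.0]),
#             torch.tensor([0.9, 0.2, 0.4, 0.8]),
#             torch.tensor([1.0, 1.0, 1.0, 0.0]))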