Example #1
	def update(self, target_Q, count):
		# Wait until the replay buffer holds at least one full batch.
		if len(self.buffer_object) < self.params['batch_size']:
			return 0
		s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix = self.buffer_object.sample(self.params['batch_size'])
		r_matrix = numpy.clip(r_matrix,
		                      a_min=-self.params['reward_clip'],
		                      a_max=self.params['reward_clip'])

		s_matrix = torch.from_numpy(s_matrix).float().to(self.device)
		a_matrix = torch.from_numpy(a_matrix).float().to(self.device)
		r_matrix = torch.from_numpy(r_matrix).float().to(self.device)
		done_matrix = torch.from_numpy(done_matrix).float().to(self.device)
		sp_matrix = torch.from_numpy(sp_matrix).float().to(self.device)

		# Bootstrapped TD target: y = r + gamma * (1 - done) * max_a' Q_target(s', a').
		Q_star, _ = target_Q.get_best_qvalue_and_action(sp_matrix)
		Q_star = Q_star.reshape((self.params['batch_size'], -1))
		with torch.no_grad():
			y = r_matrix + self.params['gamma'] * (1 - done_matrix) * Q_star
		# Regress the online estimate Q(s, a) toward the target.
		y_hat = self.forward(s_matrix, a_matrix)
		loss = self.criterion(y_hat, y)
		self.zero_grad()
		loss.backward()
		self.optimizer.step()
		self.zero_grad()
		# Sync the target network with the online network.
		utils_for_q_learning.sync_networks(
		    target=target_Q,
		    online=self,
		    alpha=self.params['target_network_learning_rate'],
		    copy=False)
		return loss.cpu().data.numpy()
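
Example #1 ends with utils_for_q_learning.sync_networks(..., copy=False), and the other examples below rely on the same helper. The helper itself is not shown in these snippets; the sketch below is a minimal guess at what it presumably does, assuming copy=False performs a Polyak-style soft update with rate alpha and copy=True performs a hard parameter copy. Treat the body as an assumption, not the library's actual implementation.

import torch

def sync_networks(target, online, alpha, copy=False):
    # Hypothetical sketch of the target-network sync used in the examples.
    # copy=True  -> hard copy (used once at initialization, see Example #3 below)
    # copy=False -> soft/Polyak update: theta_target <- alpha*theta_online + (1-alpha)*theta_target
    with torch.no_grad():
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            if copy:
                t_param.data.copy_(o_param.data)
            else:
                t_param.data.mul_(1.0 - alpha).add_(alpha * o_param.data)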
Example #2
    def update(self, target_Q):

        # Wait until the replay buffer holds at least one full batch.
        if len(self.buffer_object.storage) < params['batch_size']:
            return
        batch = random.sample(self.buffer_object.storage, params['batch_size'])
        s_li = [b['s'] for b in batch]
        sp_li = [b['sp'] for b in batch]
        r_li = [b['r'] for b in batch]
        done_li = [b['done'] for b in batch]
        a_li = [b['a'] for b in batch]
        s_matrix = numpy.array(s_li).reshape(params['batch_size'],
                                             self.state_size)
        a_matrix = numpy.array(a_li).reshape(params['batch_size'],
                                             self.action_size)
        r_matrix = numpy.array(r_li).reshape(params['batch_size'], 1)

        r_matrix = numpy.clip(r_matrix,
                              a_min=-self.params['reward_clip'],
                              a_max=self.params['reward_clip'])
        sp_matrix = numpy.array(sp_li).reshape(params['batch_size'],
                                               self.state_size)
        done_matrix = numpy.array(done_li).reshape(params['batch_size'], 1)
        # Bootstrapped TD target from the target network's best centroid values.
        Q_star = target_Q.get_best_centroid_batch(torch.FloatTensor(sp_matrix))
        Q_star = Q_star.reshape((params['batch_size'], -1))
        y = r_matrix + self.params['gamma'] * (1 - done_matrix) * Q_star

        y_hat = self.forward(torch.FloatTensor(s_matrix),
                             torch.FloatTensor(a_matrix))
        loss = self.criterion(y_hat, torch.FloatTensor(y))
        self.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.zero_grad()
        utils_for_q_learning.sync_networks(
            target=target_Q,
            online=self,
            alpha=params['target_network_learning_rate'],
            copy=False)
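
Example #1 expects self.buffer_object.sample(batch_size) to return five stacked numpy arrays, while Example #2 indexes self.buffer_object.storage directly as a list of transition dicts keyed by 's', 'a', 'r', 'done', and 'sp'. Neither snippet shows the buffer class; the sketch below is a hypothetical minimal implementation that would satisfy both interfaces. The class name ReplayBuffer, the max_length argument, and the append signature are assumptions.

import random
import numpy

class ReplayBuffer:
    # Hypothetical buffer compatible with both update() variants above.
    def __init__(self, max_length=100000):
        self.storage = []            # list of transition dicts, as read by Example #2
        self.max_length = max_length

    def __len__(self):
        return len(self.storage)

    def append(self, s, a, r, done, sp):
        if len(self.storage) >= self.max_length:
            self.storage.pop(0)      # drop the oldest transition first
        self.storage.append({'s': s, 'a': a, 'r': r, 'done': done, 'sp': sp})

    def sample(self, batch_size):
        # Return the five stacked matrices consumed by Example #1.
        batch = random.sample(self.storage, batch_size)
        s_matrix = numpy.array([b['s'] for b in batch])
        a_matrix = numpy.array([b['a'] for b in batch])
        r_matrix = numpy.array([b['r'] for b in batch]).reshape(batch_size, 1)
        done_matrix = numpy.array([b['done'] for b in batch]).reshape(batch_size, 1)
        sp_matrix = numpy.array([b['sp'] for b in batch])
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix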
Example #3
	s0 = env.reset()
	utils_for_q_learning.action_checker(env)
	Q_object = Net(params,
	               env,
	               state_size=len(s0),
	               action_size=len(env.action_space.low),
	               device=device)
	Q_object_target = Net(params,
	                      env,
	                      state_size=len(s0),
	                      action_size=len(env.action_space.low),
	                      device=device)
	Q_object_target.eval()

	# Initialize the target network from the online network (copy=True).
	utils_for_q_learning.sync_networks(target=Q_object_target,
	                                   online=Q_object,
	                                   alpha=params['target_network_learning_rate'],
	                                   copy=True)

	G_li = []
	loss_li = []
	all_times_per_steps = []
	all_times_per_updates = []
	for episode in range(params['max_episode']):
		print("episode {}".format(episode))
		Q_this_episode = Net(params,
		                     env,
		                     state_size=len(s0),
		                     action_size=len(env.action_space.low),
		                     device=device)
		utils_for_q_learning.sync_networks(target=Q_this_episode,
		                                   online=Q_object,