Example #1
    def __init__(self,
                 image_shape,
                 output_size,
                 capacity=int(1e6),
                 learning_rate=1e-3):
        self.output_size = output_size
        self.access = Access(capacity)
        self.value_net = DQN(image_shape, output_size)
        self.target_net = deepcopy(self.value_net)
        # use the GPU automatically when it is available
        self.gpu = torch.cuda.is_available()
        if self.gpu:
            self.value_net.cuda()
            self.target_net.cuda()

        self.optimizer = torch.optim.Adam(self.value_net.parameters(),
                                          lr=learning_rate)
        self.loss_func = nn.MSELoss()
Example #2
    def __init__(self, state_size, action_size, access_size=1024):
        self.state_size = state_size
        self.action_size = action_size
        self.noise = Noise(action_size)
        self.access = Access(access_size)

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)

        if torch.cuda.is_available():
            self.actor.cuda()
            self.target_actor.cuda()
            self.critic.cuda()
            self.target_critic.cuda()
Example #3
    def __init__(self, state_size, action_size,
                 access_size=ACCESS_SIZE):
        self.state_size = state_size
        self.action_size = action_size
        self.noise = Noise(action_size)
        self.access = Access(access_size)

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(
            self.actor.parameters(), LR_ACTOR)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(
            self.critic.parameters(), LR_CRITIC)

        if torch.cuda.is_available():
            self.actor.cuda()
            self.target_actor.cuda()
            self.critic.cuda()
            self.target_critic.cuda()
Example #4
class Agent(object):
    def __init__(self,
                 image_shape,
                 output_size,
                 capacity=int(1e6),
                 learning_rate=1e-3):
        self.output_size = output_size
        self.access = Access(capacity)
        self.value_net = DQN(image_shape, output_size)
        self.target_net = deepcopy(self.value_net)
        # use the GPU automatically when it is available
        self.gpu = torch.cuda.is_available()
        if self.gpu:
            self.value_net.cuda()
            self.target_net.cuda()

        self.optimizer = torch.optim.Adam(self.value_net.parameters(),
                                          lr=learning_rate)
        self.loss_func = nn.MSELoss()

    def get_deterministic_policy(self, x):
        x = Variable(torch.from_numpy(x.astype(np.float32)))
        if not self.gpu:
            out = self.value_net(x).data.numpy()
            return np.argmax(out, axis=1)
        else:
            x = x.cuda()
            out = self.value_net(x)
            out = out.cpu().data.numpy()
            return np.argmax(out, axis=1)

    def get_stochastic_policy(self, x):
        x = Variable(torch.from_numpy(x.astype(np.float32)))
        if not self.gpu:
            out = softmax(self.value_net(x), 1)
            out = out.data.numpy()
            return np.random.choice(self.output_size, 1, p=out[0])[0]
        else:
            x = x.cuda()
            out = softmax(self.value_net(x), 1)
            out = out.cpu().data.numpy()
            return np.random.choice(self.output_size, 1, p=out[0])[0]

    def get_epsilon_policy(self, x, epsilon=0.9):
        # with probability (1 - epsilon) act uniformly at random,
        # otherwise sample from the softmax policy
        if np.random.uniform() > epsilon:
            return np.random.randint(self.output_size)
        else:
            return self.get_stochastic_policy(x)

    def optimize(self, batch_size=64, gamma=.9):
        batch = self.sample(batch_size)
        if self.gpu:
            state, action, reward, done, next_state = \
                [Variable(torch.from_numpy(np.float32(i))).cuda() for i in batch]
            action = action.type(torch.LongTensor).cuda()
        else:
            state, action, reward, done, next_state = \
                [Variable(torch.from_numpy(np.float32(i))) for i in batch]
            action = action.type(torch.LongTensor)

        value = self.value_net(state).gather(1, action.unsqueeze(1))
        next_value = self.target_net(next_state).detach()
        next_value = next_value.max(1)[0].view([-1, 1])
        value = value.squeeze(1)
        next_value = next_value.squeeze(1)
        target = done * reward + (1 - done) * (reward + gamma * next_value)
        loss = self.loss_func(value, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def _update_target(self):
        # update target network parameters
        for t, s in zip(self.target_net.parameters(),
                        self.value_net.parameters()):
            t.data.copy_(s.data)

    def append(self, *args):
        self.access.append(*args)

    def sample(self, batch_size=128):
        return self.access.sample(batch_size)
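
For orientation, a minimal training loop around the agent in Example #4 could look like the sketch below. Everything outside the Agent API is an assumption: env is a classic Gym-style environment whose step returns (obs, reward, done, info), observations are NumPy arrays batched with np.newaxis, Access is assumed to store and sample transitions in the (state, action, reward, done, next_state) order that optimize unpacks, and the warm-up length and update interval are arbitrary.

# Hypothetical usage sketch for the DQN agent above; env, shapes and
# hyper-parameters are illustrative assumptions, not part of the source.
import numpy as np

agent = Agent(image_shape=(4, 84, 84), output_size=env.action_space.n)

total_steps = 0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        # the policy methods expect a batched float observation
        action = agent.get_epsilon_policy(state[np.newaxis], epsilon=0.9)
        next_state, reward, done, _ = env.step(action)
        agent.append(state, action, reward, float(done), next_state)
        state = next_state
        total_steps += 1
        if total_steps > 1000:              # let the replay buffer warm up first
            agent.optimize(batch_size=64, gamma=0.9)
    if episode % 10 == 0:
        agent._update_target()              # periodic hard update of the target net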
Example #5
    def run(self, sess, max_episodes, t_max=8):
        episode_score_list = []
        episode = 0
        while episode < max_episodes:
            episode += 1
            episode_score, _ = self.run_episode(sess, t_max)
            episode_score_list.append(episode_score)
            GD[str(self.name)] = episode_score_list
            if self.name == 'W0':
                print('Episode: %d, score: %f' % (episode, episode_score))
                print('\n')


with tf.Session() as sess:
    with tf.device("/cpu:0"):
        A = Access(batch_size, state_size, action_size)
        F_list = []
        for i in range(NUMS_CPU):
            F_list.append(
                Worker('W%i' % i, A, batch_size, state_size, action_size))
        COORD = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())
        sess.graph.finalize()

        threads_list = []
        for ac in F_list:
            # bind the current worker; a bare closure would capture `ac` by reference
            job = lambda ac=ac: ac.run(sess, max_episodes)
            t = threading.Thread(target=job)
            t.start()
            threads_list.append(t)
        COORD.join(threads_list)
Example #6
from agent.framework import Framework
from agent.access import Access

state_size = [50, 58, 5]
A = Access(state_size, 3)
F = Framework(A, state_size, 3, "W0")
Example #7
class Agent(object):
    def __init__(self, state_size, action_size,
                 access_size=ACCESS_SIZE):
        self.state_size = state_size
        self.action_size = action_size
        self.noise = Noise(action_size)
        self.access = Access(access_size)

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(
            self.actor.parameters(), LR_ACTOR)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(
            self.critic.parameters(), LR_CRITIC)

        if torch.cuda.is_available():
            self.actor.cuda()
            self.target_actor.cuda()
            self.critic.cuda()
            self.target_critic.cuda()

    @staticmethod
    def _soft_update(target, source, tau=1e-3):
        # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta_source
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(t.data * (1.0 - tau) + s.data * tau)

    @staticmethod
    def _hard_update(target, source):
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(s.data)

    def __call__(self, *args, **kwargs):
        return self.get_policy(*args)

    def append(self, *args):
        self.access.append(*args)

    def sample(self, *args):
        return self.access.sample(*args)

    def get_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state))).cuda()
        action = self.actor(state).detach()
        return action.data.cpu().numpy()

    def get_noise(self):
        return self.noise()

    def optimize(self, batch_size=64):
        batch = self.sample(batch_size)
        state, action, reward, _, next_state =\
            [Variable(torch.from_numpy(np.float32(i))).cuda() for i in batch]

        next_action = self.target_actor.forward(next_state).detach()
        next_value = torch.squeeze(
            self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))

        loss_critic = nf.mse_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        self._soft_update(self.target_actor, self.actor, TAU)
        self._soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(torch.load(
            "actor_{}.pkl".format(num_episode)))
        self.critic.load_state_dict(torch.load(
            "critic_{}.pkl".format(num_episode)))
        self._hard_update(self.target_actor, self.actor)
        self._hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        torch.save(self.target_actor.state_dict(),
                   "actor_{}.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "critic_{}.pkl".format(num_episode))
        print('Models saved successfully')
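
A rough interaction loop for the DDPG-style agent in Example #7 might look like the sketch below. Again, this is only a sketch under assumptions not present in the snippet: a Gym-style continuous-control env, a CUDA device (the snippet's get_policy moves tensors to the GPU unconditionally), Access sampling transitions in the (state, action, reward, done, next_state) order used by optimize, and arbitrary warm-up and save intervals. Exploration noise is added outside the agent because get_policy returns the deterministic action.

# Hypothetical DDPG training loop for the agent above; everything around
# the Agent API is an illustrative assumption, not from the source.
import numpy as np

state_size = env.observation_space.shape[0]   # Gym-style env assumed
action_size = env.action_space.shape[0]
agent = Agent(state_size, action_size)

total_steps = 0
for episode in range(500):
    state = env.reset()
    for step in range(200):
        # __call__ returns the deterministic policy; add exploration noise here
        action = agent(state[np.newaxis])[0] + agent.get_noise()
        next_state, reward, done, _ = env.step(action)
        agent.append(state, action, reward, float(done), next_state)
        state = next_state
        total_steps += 1
        if total_steps > 1000:            # warm up the replay buffer before training
            agent.optimize(batch_size=64)
        if done:
            break
    if episode % 50 == 0:
        agent.save_models(episode)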
Example #8
class Agent(object):
    def __init__(self, state_size, action_size, access_size=1024):
        self.state_size = state_size
        self.action_size = action_size
        self.noise = Noise(action_size)
        self.access = Access(access_size)

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)

        if torch.cuda.is_available():
            self.actor.cuda()
            self.target_actor.cuda()
            self.critic.cuda()
            self.target_critic.cuda()

    def __call__(self, *args, **kwargs):
        return self.get_exploration_policy(*args)

    def append(self, *args):
        self.access.append(*args)

    def sample(self, *args):
        return self.access.sample(*args)

    def get_exploitation_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state))).cuda()
        action = self.target_actor(state).detach()
        return action.data.cpu().numpy()

    def get_exploration_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state))).cuda()
        action = self.actor(state).detach()
        return action.data.cpu().numpy() + self.noise()

    def optimize(self, batch_size=64):
        batch = self.sample(batch_size)
        state, action, reward, _, next_state =\
            [Variable(torch.from_numpy(np.float32(i))).cuda() for i in batch]

        next_action = self.target_actor.forward(next_state).detach()
        next_value = torch.squeeze(
            self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))

        loss_critic = nf.smooth_l1_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        soft_update(self.target_actor, self.actor, TAU)
        soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(
            torch.load("./Models/{}_actor.pkl".format(num_episode)))
        self.critic.load_state_dict(
            torch.load("./Models/{}_critic.pkl".format(num_episode)))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        # paths kept consistent with restore_models
        torch.save(self.target_actor.state_dict(),
                   "./Models/{}_actor.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "./Models/{}_critic.pkl".format(num_episode))
        print('Models saved successfully')