Example #1
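# NOTE: assumes torch, torch.nn as nn and numpy as np are imported, and that Actor,
# Critic, Noise, ReplayBuffer, soft_update and the constants LR_A, LR_C, GAMMA,
# DELTA, SIGMA, OU_A, OU_MU are defined elsewhere in the module.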
class Agent(object):
    def __init__(
        self,
        a_dim,
        s_dim,
        a_bound,
    ):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.P_online = Actor(s_dim, a_dim)
        self.P_target = Actor(s_dim, a_dim)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(s_dim, a_dim)
        self.Q_target = Critic(s_dim, a_dim)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=LR_C)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=LR_A)
        self.loss_td = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        self.batch_size = 32

        self.discrete = False
        self.ep_step = 0
        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.
        self.action_low = -2
        self.action_high = 2

    def act(self, state, test=False):
        with torch.no_grad():
            # cast the numpy state to a batched float tensor
            state = torch.from_numpy(state).unsqueeze(0).float().to('cpu')
            action = self.P_online(state)  # continuous output
            a = action.data.cpu().numpy()
            if self.discrete:
                action = np.argmax(a)
                return a, action
            if not test:
                # exploration: update the Ornstein-Uhlenbeck noise for the
                # first 200 steps of the episode and add it to the action
                if self.ep_step < 200:
                    self.ou_level = self.noise.ornstein_uhlenbeck_level(
                        self.ou_level)
                a = a + self.ou_level
            action = np.clip(a, self.action_low, self.action_high)
            return (torch.from_numpy(action)).view(-1)

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float().unsqueeze(0),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("A circular queue does not need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device='cpu')

        #===============================Critic Update===============================
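        # TD target: y = r + GAMMA * (1 - done) * Q_target(s', P_target(s'))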
        with torch.no_grad():
            target = rewards + GAMMA * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        Q = self.Q_online((states, actions))
        td_error = self.loss_td(target, Q)
        self.q_optimizer.zero_grad()
        td_error.backward()
        self.q_optimizer.step()

        #===============================Actor Update===============================
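        # actor loss: maximize Q_online(s, P_online(s)) by minimizing its negative mean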
        q = self.Q_online((states, self.P_online(states)))
        loss_a = -torch.mean(q)
        self.p_optimizer.zero_grad()
        loss_a.backward()
        self.p_optimizer.step()

        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-2)
        soft_update(self.P_target, self.P_online, tau=1e-2)
Example #2
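# NOTE: assumes torch and numpy imports plus Actor, Critic, Noise, ReplayBuffer,
# soft_update and EPSILON_DECAY (and the OU noise constants DELTA, SIGMA, OU_A,
# OU_MU) are defined elsewhere in the module.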
class Agent():
    def __init__(self, test=False):
        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        #########################################
        """
        Some hand tune config(for developing)
        """
        self.discrete = False
        self.action_dim = 1
        self.state_dim = 3
        self.batch_size = 100
        self.action_low = -2
        self.action_high = 2
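        # 3-dim state, 1-dim action bounded in [-2, 2] (e.g. a Pendulum-style task)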
        ##########################################
        self.P_online = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        # discounted reward
        self.gamma = 0.99
        self.eps = 0.25
        # optimizer
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=1e-3)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=1e-3)
        # saved rewards and actions
        self.replay_buffer = ReplayBuffer()

        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.

        self.ep_step = 0

    def act(self, state, test=False):
        with torch.no_grad():
            # cast the numpy state to a batched float tensor on the right device
            state = torch.from_numpy(state).unsqueeze(0).float().to(
                self.device)
            action = self.P_online(state)  # continuous output
            a = action.data.cpu().numpy()
            if self.discrete:
                action = np.argmax(a)
                return a, action
            if not test:
                # exploration: update the Ornstein-Uhlenbeck noise for the
                # first 200 steps of the episode and add it to the action
                if self.ep_step < 200:
                    self.ou_level = self.noise.ornstein_uhlenbeck_level(
                        self.ou_level)
                a = a + self.ou_level
            action = np.clip(a, self.action_low, self.action_high)
            return action, action

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float(),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("A circular queue does not need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device=self.device)
        # discounted rewards
        # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device)

        ### debug shape : ok
        #===============================Critic Update===============================
        self.Q_online.train()
        Q = self.Q_online((states, actions))

        with torch.no_grad():  # don't need backprop for target value
            self.Q_target.eval()
            self.P_target.eval()
            target = rewards + self.gamma * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        critic_loss_fn = torch.nn.MSELoss()
        critic_loss = critic_loss_fn(Q, target).mean()
        # update
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        self.q_optimizer.step()
        # print("critic loss", critic_loss.item())

        #===============================Actor Update===============================
        # fix online_critic , update online_actor
        self.Q_online.eval()
        for p in self.Q_online.parameters():
            p.requires_grad = False
        for p in self.P_online.parameters():
            p.requires_grad = True
        policy_loss = -self.Q_online((states, self.P_online(states)))
        policy_loss = policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        for p in self.Q_online.parameters():
            p.requires_grad = True
        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)
        self.eps -= EPSILON_DECAY
        if self.eps <= 0:
            self.eps = 0
Example #3
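# NOTE: TensorFlow 1.x graph/session style. Assumes Actor, Critic, Noise and Memory
# classes are defined elsewhere, and that `params` provides dimensions, actor_params,
# critic_params, noise_params, gamma, b_size, n_mem_objects and memory_size.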
class DDPG:

    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects,
                             self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
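        # return the action to execute, the raw policy output and the critic's Q estimate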
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # Hindsight Experience Replay (HER): relabel ~80% of the batch with the achieved goal
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        # format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
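        # goal-conditioned inputs: concatenate the (possibly relabelled) goals to the states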
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", g)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()