# Example #1 (score: 0)
class PPO:
    """Proximal Policy Optimization (PPO-clip) agent.

    Samples trajectories with several worker processes, estimates advantages
    with GAE (generalized advantage estimation), and optimizes the clipped
    surrogate objective from https://arxiv.org/abs/1707.06347.
    """

    # discount factor
    gamma = 0.99

    # l2 regularizer coefficient (declared but never passed to the optimizers)
    l2_reg = 0

    # learning rate for both Adam optimizers
    lr = 3e-4

    # clip epsilon of ratio r(theta) in the PPO objective
    epsilon = 0.2

    # tau for generalized advantage estimation (the GAE paper calls it lambda)
    tau = 0.95

    def __init__(self, env_cls, thread_num):
        """
        :param env_cls: env class or factory function, not an instance, as we
            need to create several instances in this class (one per worker
            process plus one dummy instance for introspection).
        :param thread_num: number of sampling worker processes (despite the
            name, workers are ``multiprocessing.Process``, not threads).
        """

        self.thread_num = thread_num
        self.env_cls = env_cls

        # we use a dummy env instance to get state dim and action dim etc. information.
        dummy_env = env_cls()

        self.s_dim = dummy_env.observation_space.shape[0]

        # if continuous action, the action_space.shape (n,) indicates the number
        # of continuous action dims; otherwise action_space.n stands for the
        # number of discrete actions (a discrete space has an empty shape).
        is_discrete_action = len(dummy_env.action_space.shape)
        if is_discrete_action == 0:
            self.a_dim = dummy_env.action_space.n
            self.is_discrete_action = True
        else:
            self.a_dim = dummy_env.action_space.shape[0]
            self.is_discrete_action = False

        # initialize one independent env instance per worker process
        self.env_list = []
        for _ in range(thread_num):
            self.env_list.append(env_cls())

        # construct policy and value network
        self.policy = Policy(self.s_dim, self.a_dim)
        self.value = Value(self.s_dim)

        self.policy_optim = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.value_optim = optim.Adam(self.value.parameters(), lr=self.lr)

        # this lock is meant to keep the policy network weights untouched while
        # the render process reads the latest policy for one trajectory.
        # TODO: lock can not work yet (acquire/release calls below are
        # commented out, so rendering may observe mid-update weights).
        self.lock = multiprocessing.Lock()

    def est_adv(self, r, v, mask):
        """Estimate GAE advantages and Bellman value targets.

        The batch concatenates several trajectories back to back; mask[t] == 0
        marks the last step of a trajectory and stops bootstrapping across
        episode boundaries.

        :param r: reward, Tensor, [b]
        :param v: estimated value, Tensor, [b]
        :param mask: 0 at trajectory end, otherwise 1, Tensor, [b]
        :return: (A(s, a), V-target(s)), both Tensor [b]; advantages are
            normalized to zero mean / unit std over the whole batch.
        """
        batchsz = v.size(0)

        # v_target is worked out by the Bellman equation.
        v_target = torch.Tensor(batchsz)
        delta = torch.Tensor(batchsz)
        A_sa = torch.Tensor(batchsz)

        prev_v_target = 0
        prev_v = 0
        prev_A_sa = 0
        for t in reversed(range(batchsz)):
            # mask = 0 means the immediate reward is the real V(s) since it's
            # the end of a trajectory; this value is the target of the value net.
            # formula: V(s_t) = r_t + gamma * V(s_t+1)
            v_target[t] = r[t] + self.gamma * prev_v_target * mask[t]

            # generalized advantage estimation, see https://arxiv.org/abs/1506.02438
            # formula: delta(s_t) = r_t + gamma * V(s_t+1) - V(s_t)
            delta[t] = r[t] + self.gamma * prev_v * mask[t] - v[t]

            # formula: A(s_t, a_t) = delta(s_t) + gamma * lambda * A(s_t+1, a_t+1)
            # here symbol tau is used where the original paper uses lambda.
            A_sa[t] = delta[t] + self.gamma * self.tau * prev_A_sa * mask[t]

            # carry this step's values into the backward recursion
            prev_v_target = v_target[t]
            prev_v = v[t]
            prev_A_sa = A_sa[t]

        # normalize A_sa (NOTE(review): std() is NaN for a batch of size 1)
        A_sa = (A_sa - A_sa.mean()) / A_sa.std()

        return A_sa, v_target

    def update(self, batchsz):
        """Sample ~batchsz transitions, then run several PPO epochs.

        :param batchsz: requested number of transitions (the actual sampled
            count may be slightly larger after per-worker rounding).
        :return: None; updates self.policy and self.value in place.
        """
        # 1. sample data asynchronously via worker processes
        batch = self.sample(batchsz)

        # data in batch is : batch.state: ([1, s_dim], [1, s_dim]...)
        # batch.action: ([1, a_dim], [1, a_dim]...)
        # batch.reward/ batch.mask: ([1], [1]...)
        s = torch.from_numpy(np.stack(batch.state))
        a = torch.from_numpy(np.stack(batch.action))
        r = torch.Tensor(np.stack(batch.reward))
        mask = torch.Tensor(np.stack(batch.mask))
        batchsz = s.size(0)

        # 2. get estimated V(s) and PI_old(s, a),
        # actually, PI_old(s, a) can be saved when interacting with env, so as to save the time of one forward elapsed.
        # v: [b, 1] => [b]
        v = self.value(Variable(s)).data.squeeze()
        log_pi_old_sa = self.policy.get_log_prob(Variable(s), Variable(a)).data

        # 3. estimate advantage and v_target according to GAE and Bellman Equation
        A_sa, v_target = self.est_adv(r, v, mask)

        # 4. backprop.
        # the following code will involve neural network forward/backward,
        # hence we wrap the related tensors in Variable.
        v_target = Variable(v_target)
        A_sa = Variable(A_sa)
        s = Variable(s)
        a = Variable(a)
        log_pi_old_sa = Variable(log_pi_old_sa)

        # run a fixed number of optimization epochs over the sampled batch
        for _ in range(5):

            # 4.1 shuffle current batch
            perm = torch.randperm(batchsz)
            # shuffle the variables in lockstep for this epoch
            v_target_shuf, A_sa_shuf, s_shuf, a_shuf, log_pi_old_sa_shuf = v_target[perm], A_sa[perm], s[perm], a[perm], \
                                                                           log_pi_old_sa[perm]

            # 4.2 get mini-batch for optimizing
            optim_batchsz = 4096
            optim_chunk_num = int(np.ceil(batchsz / optim_batchsz))
            # chunk the total batch into mini-batches
            v_target_shuf, A_sa_shuf, s_shuf, a_shuf, log_pi_old_sa_shuf = torch.chunk(v_target_shuf, optim_chunk_num), \
                                                                           torch.chunk(A_sa_shuf, optim_chunk_num), \
                                                                           torch.chunk(s_shuf, optim_chunk_num), \
                                                                           torch.chunk(a_shuf, optim_chunk_num), \
                                                                           torch.chunk(log_pi_old_sa_shuf,
                                                                                       optim_chunk_num)
            # 4.3 iterate all mini-batches to optimize
            for v_target_b, A_sa_b, s_b, a_b, log_pi_old_sa_b in zip(
                    v_target_shuf, A_sa_shuf, s_shuf, a_shuf,
                    log_pi_old_sa_shuf):
                # 1. update value network on MSE against the Bellman targets
                v_b = self.value(s_b)
                loss = torch.pow(v_b - v_target_b, 2).mean()
                self.value_optim.zero_grad()
                loss.backward()
                # nn.utils.clip_grad_norm(self.value.parameters(), 4)
                self.value_optim.step()

                # 2. update policy network by clipping
                # [b, 1]
                log_pi_sa = self.policy.get_log_prob(s_b, a_b)
                # ratio = exp(log_Pi(a|s) - log_Pi_old(a|s)) = Pi(a|s) / Pi_old(a|s)
                # we use log_pi for stability of numerical operation
                # [b, 1] => [b]
                ratio = torch.exp(log_pi_sa - log_pi_old_sa_b).squeeze(1)
                surrogate1 = ratio * A_sa_b
                surrogate2 = torch.clamp(ratio, 1 - self.epsilon,
                                         1 + self.epsilon) * A_sa_b
                # element-wise min of the two surrogates; the negative sign
                # converts gradient ascent into gradient descent.
                surrogate = -torch.min(surrogate1, surrogate2).mean()

                # backprop
                # NOTE(review): retain_graph=True looks unnecessary here since
                # the graph is rebuilt for each mini-batch — confirm before removing.
                self.policy_optim.zero_grad()
                surrogate.backward(retain_graph=True)
                # gradient clipping, for stability
                # (clip_grad_norm_ is the supported in-place API; the old
                # clip_grad_norm alias is deprecated/removed in newer PyTorch)
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 10)
                # self.lock.acquire() # retain lock to update weights
                self.policy_optim.step()
                # self.lock.release() # release lock

    def sample(self, batchsz):
        """Sample at least batchsz transitions, split across worker processes.

        batchsz is divided equally among the workers; after every worker pushes
        its buffer into the queue, the buffers are merged and returned.

        :param batchsz: requested total number of transitions.
        :return: merged batch (whatever the buffer's .sample() returns).
        """

        # batchsz is split per worker; ceil means the final total may be
        # slightly larger than the requested batchsz.
        thread_batchsz = np.ceil(batchsz / self.thread_num).astype(np.int32)
        # queue to collect each worker's data
        queue = multiprocessing.Queue()

        # when saving tensors in a Queue, the producer process must stay alive
        # until Queue.get() — see
        # https://discuss.pytorch.org/t/using-torch-tensor-over-multiprocessing-queue-process-fails/2847/2
        evt = multiprocessing.Event()
        threads = []
        for i in range(self.thread_num):
            thread_args = (i, queue, self.env_list[i], self.policy,
                           thread_batchsz)
            threads.append(
                multiprocessing.Process(target=sampler, args=thread_args))
        for t in threads:
            # daemonize workers so they are killed when the main process stops
            t.daemon = True
            t.start()

        # we need the first ReplayMemory object first, then merge the other
        # workers' ReplayMemory objects into it via its append function.
        pid0, buff0, avg_reward0 = queue.get()
        avg_reward = [avg_reward0]
        for _ in range(1, self.thread_num):
            pid, buff_, avg_reward_ = queue.get()
            buff0.append(buff_)  # merge current ReplayMemory into buff0
            avg_reward.append(avg_reward_)

        # now buff holds all sampled data; avg_reward is the mean over workers
        buff = buff0
        avg_reward = np.array(avg_reward).mean()

        print('avg reward:', avg_reward)

        return buff.sample()

    def render(self, interval=8):
        """Start the render process and return immediately.

        :param interval: seconds to pause after each finished episode.
        :return: None
        """
        thread = multiprocessing.Process(target=self.render_,
                                         args=(interval, ))
        thread.start()

    def render_(self, interval):
        """Render loop; runs forever in its own process (started by render()).

        :param interval: seconds to sleep after an episode ends.
        :return: never returns
        """
        env = self.env_cls()
        s = env.reset()

        while True:
            # [s_dim] => [1, s_dim]
            s = Variable(torch.Tensor(s)).unsqueeze(0)
            # [1, s_dim] => [1, a_dim] => [a_dim]
            a = self.policy.select_action(s).squeeze().data.numpy()
            # interact with env
            s, r, done, _ = env.step(a)

            env.render()

            if done:
                s = env.reset()
                time.sleep(interval)

    def save(self, filename='ppo'):
        """Save value/policy weights to <filename>.val.mdl / <filename>.pol.mdl."""

        torch.save(self.value.state_dict(), filename + '.val.mdl')
        torch.save(self.policy.state_dict(), filename + '.pol.mdl')

        print('saved network to mdl')

    def load(self, filename='ppo'):
        """Load value/policy weights from <filename>.*.mdl if the files exist."""
        value_mdl = filename + '.val.mdl'
        policy_mdl = filename + '.pol.mdl'
        if os.path.exists(value_mdl):
            self.value.load_state_dict(torch.load(value_mdl))
            print('loaded checkpoint from file:', value_mdl)
        if os.path.exists(policy_mdl):
            self.policy.load_state_dict(torch.load(policy_mdl))

            print('loaded checkpoint from file:', policy_mdl)
# Example #2 (score: 0)
class PPO:
    """Proximal Policy Optimization (PPO-clip) agent, single-process variant.

    Hyperparameters come from a parsed-arguments object; sampling is delegated
    to a module-level ``sample(policy)`` helper.
    """

    def __init__(self):
        """Build policy/value networks and their Adam optimizers.

        Hyperparameters (gamma, lr, epsilon, tau, epoch_num, optim_batchsz)
        are read from ``arguements.achieve_args()``.
        """

        self.args = arguements.achieve_args()
        self.gamma = self.args.gamma
        self.lr = self.args.lr
        self.epsilon = self.args.epsilon
        self.tau = self.args.tau

        # construct policy and value network
        # (state dim 1, action dim 2 are fixed for this task)
        self.policy = Policy(1, 2)
        self.value = Value(1)

        self.policy_optim = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.value_optim = optim.Adam(self.value.parameters(), lr=self.lr)

    def est_adv(self, r, v, mask):
        """Estimate GAE advantages and Bellman value targets.

        The batch concatenates several trajectories back to back; mask[t] == 0
        marks the last step of a trajectory and stops bootstrapping across
        episode boundaries.

        :param r: reward, Tensor, [b]
        :param v: estimated value, Tensor, [b]
        :param mask: 0 at trajectory end, otherwise 1, Tensor, [b]
        :return: (A(s, a), V-target(s)), both Tensor [b]; advantages are
            normalized to zero mean / unit std over the whole batch.
        """
        batchsz = v.size(0)

        # v_target is worked out by the Bellman equation.
        v_target = torch.Tensor(batchsz)
        delta = torch.Tensor(batchsz)
        A_sa = torch.Tensor(batchsz)

        prev_v_target = 0
        prev_v = 0
        prev_A_sa = 0
        for t in reversed(range(batchsz)):
            # mask = 0 means the immediate reward is the real V(s) since it's
            # the end of a trajectory; this value is the target of the value net.
            # formula: V(s_t) = r_t + gamma * V(s_t+1)
            v_target[t] = r[t] + self.gamma * prev_v_target * mask[t]

            # formula: delta(s_t) = r_t + gamma * V(s_t+1) - V(s_t)
            delta[t] = r[t] + self.gamma * prev_v * mask[t] - v[t]

            # formula: A(s_t, a_t) = delta(s_t) + gamma * lambda * A(s_t+1, a_t+1)
            A_sa[t] = delta[t] + self.gamma * self.tau * prev_A_sa * mask[t]

            # carry this step's values into the backward recursion
            prev_v_target = v_target[t]
            prev_v = v[t]
            prev_A_sa = A_sa[t]

        # normalize A_sa (NOTE(review): std() is NaN for a batch of size 1)
        A_sa = (A_sa - A_sa.mean()) / A_sa.std()

        return A_sa, v_target

    def update(self):
        """Sample a batch with the current policy, then run PPO epochs.

        :return: (avg_reward of the sampled batch, the updated policy network)
        """
        # 1. sample data with the current policy

        # batch = self.sample_original(self.args.sample_point_num)
        batch, avg_reward = sample(self.policy)
        self.avg_reward = avg_reward

        s = torch.from_numpy(np.stack(batch['state'])).view(-1, 1)
        a = torch.from_numpy(np.array(batch['action']))
        r = torch.from_numpy(np.array(batch['reward']))
        # NOTE(review): est_adv expects mask == 0 at episode end; confirm that
        # sample() stores (1 - done) under the 'done' key, otherwise the
        # bootstrapping here is inverted.
        mask = torch.from_numpy(np.array(batch['done']))
        batchsz = s.size(0)

        # 2. get estimated V(s) and PI_old(s, a),
        # v: [b, 1] => [b]
        v = self.value(Variable(s)).data.squeeze()
        log_pi_old_sa = self.policy.get_log_prob(Variable(s), Variable(a)).data

        # 3. estimate advantage and v_target according to GAE and Bellman Equation
        A_sa, v_target = self.est_adv(r, v, mask)

        # 4. backprop.

        v_target = Variable(v_target)
        A_sa = Variable(A_sa)
        s = Variable(s)
        a = Variable(a)
        log_pi_old_sa = Variable(log_pi_old_sa)

        for _ in range(self.args.epoch_num):

            # 4.1 shuffle current batch
            perm = torch.randperm(batchsz)
            # shuffle the variables in lockstep for this epoch
            v_target_shuf, A_sa_shuf, s_shuf, a_shuf, log_pi_old_sa_shuf = v_target[perm], A_sa[perm], s[perm], a[perm], \
                                                                           log_pi_old_sa[perm]

            # 4.2 get mini-batch for optimizing
            optim_batchsz = self.args.optim_batchsz
            optim_chunk_num = int(np.ceil(batchsz / optim_batchsz))
            # chunk the total batch into mini-batches
            v_target_shuf, A_sa_shuf, s_shuf, a_shuf, log_pi_old_sa_shuf = torch.chunk(v_target_shuf, optim_chunk_num), \
                                                                           torch.chunk(A_sa_shuf, optim_chunk_num), \
                                                                           torch.chunk(s_shuf, optim_chunk_num), \
                                                                           torch.chunk(a_shuf, optim_chunk_num), \
                                                                           torch.chunk(log_pi_old_sa_shuf,
                                                                                       optim_chunk_num)
            # 4.3 iterate all mini-batches to optimize
            for v_target_b, A_sa_b, s_b, a_b, log_pi_old_sa_b in zip(
                    v_target_shuf, A_sa_shuf, s_shuf, a_shuf,
                    log_pi_old_sa_shuf):
                # 1. update value network on MSE against the Bellman targets
                v_b = self.value(s_b)
                loss = torch.pow(v_b - v_target_b, 2).mean()
                self.value_optim.zero_grad()
                loss.backward()
                self.value_optim.step()

                # 2. update policy network by clipping
                # [b, 1]
                log_pi_sa = self.policy.get_log_prob(s_b, a_b)
                # ratio = exp(log_Pi(a|s) - log_Pi_old(a|s)) = Pi(a|s) / Pi_old(a|s)
                # [b, 1] => [b]
                ratio = torch.exp(log_pi_sa - log_pi_old_sa_b).squeeze(1)
                surrogate1 = ratio * A_sa_b
                surrogate2 = torch.clamp(ratio, 1 - self.epsilon,
                                         1 + self.epsilon) * A_sa_b
                # element-wise min; negative sign converts ascent to descent
                surrogate = -torch.min(surrogate1, surrogate2).mean()

                # backprop
                self.policy_optim.zero_grad()
                surrogate.backward(retain_graph=True)
                # gradient clipping, for stability
                # (clip_grad_norm_ is the supported in-place API; the old
                # clip_grad_norm alias is deprecated/removed in newer PyTorch)
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 10)
                self.policy_optim.step()
        return self.avg_reward, self.policy

    def save(self, i, filename='ppo'):
        """Save weights to <filename><i>.val.mdl / <filename><i>.pol.mdl."""

        torch.save(self.value.state_dict(), filename + str(i) + '.val.mdl')
        torch.save(self.policy.state_dict(), filename + str(i) + '.pol.mdl')

        print('saved network to mdl')

    def load(self, filename='ppo'):
        """Load weights from a fixed checkpoint if present.

        NOTE(review): the ``filename`` parameter is ignored — the checkpoint
        paths are hardcoded below. Kept as-is to preserve behavior.

        :return: (self.value, self.policy) after any successful load
        """
        # value_mdl = 'params005_base.val.mdl'
        # policy_mdl = 'params005_base.pol.mdl'

        # value_mdl = 'params006_480.val.mdl'
        # policy_mdl = 'params006_480.pol.mdl'

        value_mdl = 'params007_900.val.mdl'
        policy_mdl = 'params007_900.pol.mdl'

        if os.path.exists(value_mdl):
            self.value.load_state_dict(torch.load(value_mdl))
            print('loaded checkpoint from file:', value_mdl)
        if os.path.exists(policy_mdl):
            self.policy.load_state_dict(torch.load(policy_mdl))

            print('loaded checkpoint from file:', policy_mdl)
        return self.value, self.policy