Example No. 1
    def generate_expert(self):
        """Roll out one episode with the scripted teacher and return the visited
        states plus one-hot expert spatial targets (the a16, a32, a64 buffers)."""
        state, reward, done, info = self.env.reset()
        state_buffer, a16_buffer, a32_buffer, a64_buffer = [], [], [], []
        while not done:
            a0, a1, a2 = teacher.action(state, info)
            # One-hot spatial targets at full, half and quarter resolution.
            action_64 = np.zeros((scr_pixels * scr_pixels, ), dtype=np.float32)
            action_32 = np.zeros((scr_pixels * scr_pixels // 4, ),
                                 dtype=np.float32)
            action_16 = np.zeros((scr_pixels * scr_pixels // 16, ),
                                 dtype=np.float32)
            action_64[a1 * scr_pixels + a2] = 1
            # Downsample the expert click (a1, a2) onto the coarser grids.
            action_32[(a1 // 2) * (scr_pixels // 2) + a2 // 2] = 1
            action_16[(a1 // 4) * (scr_pixels // 4) + a2 // 4] = 1
            # print(action == 1)
            state_buffer.append([state])
            a16_buffer.append([action_16])
            a32_buffer.append([action_32])
            a64_buffer.append([action_64])

            # Map the teacher's (a0, a1, a2) to the wrapper's flat action id.
            state, reward, done, info = self.env.step(
                1 if a0 == 0 else int(2 + a1 * scr_pixels + a2))
        state_buffer = np.vstack(state_buffer)
        a16_buffer = np.vstack(a16_buffer)
        a32_buffer = np.vstack(a32_buffer)
        a64_buffer = np.vstack(a64_buffer)
        # print(state_buffer.shape)

        return state_buffer, a16_buffer, a32_buffer, a64_buffer
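
The only subtle step in this example is turning a single expert click (a1, a2) on a scr_pixels x scr_pixels screen into flattened one-hot targets at three resolutions. Below is a minimal standalone sketch of that encoding; one_hot_targets is a hypothetical helper, and scr_pixels = 64 with row-major flattening is an assumption implied by the buffer names above, not confirmed by the snippet.

import numpy as np

scr_pixels = 64  # assumed screen resolution

def one_hot_targets(a1, a2):
    # Full-resolution flattened one-hot target, as in action_64 above.
    full = np.zeros(scr_pixels * scr_pixels, dtype=np.float32)
    full[a1 * scr_pixels + a2] = 1
    # Coarser targets: downsample the click coordinate onto 32x32 and 16x16 grids.
    half = np.zeros(scr_pixels * scr_pixels // 4, dtype=np.float32)
    half[(a1 // 2) * (scr_pixels // 2) + a2 // 2] = 1
    quarter = np.zeros(scr_pixels * scr_pixels // 16, dtype=np.float32)
    quarter[(a1 // 4) * (scr_pixels // 4) + a2 // 4] = 1
    return quarter, half, full

a16, a32, a64 = one_hot_targets(10, 37)
print(a64.argmax(), a32.argmax(), a16.argmax())  # 677 178 41
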
Example No. 2
    def pre_train(self):
        """Pre-train against the scripted teacher: the environment executes the
        teacher's spatial action, and the global network is updated with both
        the usual A3C targets and an expert (imitation) loss."""
        global GLOBAL_RUNNING_R, GLOBAL_EP
        # self.AC.pull_global()
        total_step = 1
        buffer_s, buffer_a0, buffer_a1, buffer_a2 = [], [], [], []
        buffer_r, buffer_avail = [], []
        buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            # timestep[0] contains rewards, observations, etc.; see pysc2 for details.
            state, _, _, info = self.env.reset()
            ep_r = 0
            lei_ji = 0  # accumulated raw (unshaped) reward for this episode
            while True:
                a0, a1, a2 = self.AC.choose_action([state], [info])
                a0_exp, a1_exp, a2_exp = teacher.action(state, info)
                # print(state)
                # a0 comes from the policy, the spatial coordinates from the teacher.
                action = 1 if a0 == 0 else int(2 + a1_exp * scr_pixels + a2_exp)
                buffer_s.append([state])
                buffer_avail.append([info])
                buffer_a0.append(a0)
                buffer_a1.append(a1)
                buffer_a2.append(a2)
                buffer_a0_exp.append(a0_exp)
                buffer_a1_exp.append(a1_exp)
                buffer_a2_exp.append(a2_exp)
                state, reward, done, info = self.env.step(action)
                lei_ji += reward
                if lei_ji >= 20:  # end the episode once 20 raw reward has been collected
                    done = True
                if reward > 0:  # scale positive rewards up as the episode reward grows
                    reward = reward * (1 + ep_r * weight)
                buffer_r.append(reward)
                ep_r += reward
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    if done:
                        v_s_ = 0
                    else:
                        v_s_ = sess.run(self.AC.value, {self.AC.s: [state]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_  # compute v target
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    # Put each buffer together into a single array for the feed dict.
                    buffer_s = np.vstack(buffer_s)
                    buffer_a0 = np.vstack(buffer_a0)
                    buffer_a1 = np.vstack(buffer_a1)
                    buffer_a2 = np.vstack(buffer_a2)
                    buffer_v_target = np.vstack(buffer_v_target)
                    buffer_avail = np.vstack(buffer_avail)
                    buffer_a0_exp = np.vstack(buffer_a0_exp)
                    buffer_a1_exp = np.vstack(buffer_a1_exp)
                    buffer_a2_exp = np.vstack(buffer_a2_exp)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a0: buffer_a0,
                        self.AC.a1: buffer_a1,
                        self.AC.a2: buffer_a2,
                        self.AC.a0_exp: buffer_a0_exp,
                        self.AC.a1_exp: buffer_a1_exp,
                        self.AC.a2_exp: buffer_a2_exp,
                        self.AC.v_target: buffer_v_target,
                        self.AC.available: buffer_avail,
                    }
                    test = self.AC.update_global_high(feed_dict)  # update parameters
                    #closs ,aloss,exp_loss= sess.run([self.AC.c_loss,self.AC.a_loss,self.AC.exp_loss], feed_dict=feed_dict)
                    #print("c_loss:",closs,"a_loss:",aloss,"exp_loss",exp_loss)
                    #sigma_1,sigma_2 = sess.run([self.AC.sigma_1,self.AC.sigma_2],feed_dict = feed_dict)
                    entropy, aloss, td, exp_loss, prob_a = sess.run(
                        [
                            self.AC.entropy, self.AC.a_loss, self.AC.td,
                            self.AC.exp_loss, self.AC.log_prob_a
                        ],
                        feed_dict=feed_dict)

                    buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
                    buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
                    self.AC.pull_global()

                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
                    print(
                        self.name,
                        "episode:",
                        GLOBAL_EP,
                        '| reward: %.1f' % lei_ji,
                        "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1],
                        # '| sigma:', test, # debug
                    )
                    GLOBAL_EP += 1
                    print("entropy", entropy[0][0], "td", td[0], "prob_a:",
                          prob_a, "prob_exp:", exp_loss, "aloss", aloss)
                    # self.globalAC.save_ckpt()
                    # with open("/summary.txt",'w') as f:
                    #    f.write('%.lf' % ep_r)
                    # Curriculum: move to a harder map after a high score, an easier one after a low score.
                    if ep_r > score_high[self.hard] or ep_r < score_low[self.hard]:
                        self.env.close()
                        self.hard = self.hard + 1 if ep_r > score_high[self.hard] else self.hard - 1
                        self.env = wrap(game[self.hard])
                    break
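
Both pre_train variants compute their critic targets the same way: walk the reward buffer backwards from a bootstrap value (0 at episode end, V(s_last) otherwise) and discount by GAMMA. Here is a minimal standalone sketch of that computation; discounted_v_targets is a hypothetical helper name and GAMMA = 0.9 is an assumed value, not taken from the snippets.

GAMMA = 0.9  # assumed discount factor

def discounted_v_targets(rewards, bootstrap_value, gamma=GAMMA):
    # Reverse pass: v_target[t] = r[t] + gamma * v_target[t + 1],
    # seeded with the bootstrap value, exactly as the loops above do.
    v = bootstrap_value
    targets = []
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return targets

print(discounted_v_targets([1.0, 0.0, 2.0], bootstrap_value=0.5))
# approximately [2.9845, 2.205, 2.45]
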
Example No. 3
    def pre_train(self):
        """Pre-train by following the scripted teacher: execute the teacher's
        actions in the environment and update the global network on the
        resulting trajectories."""
        global GLOBAL_RUNNING_R, GLOBAL_EP
        # self.AC.pull_global()
        total_step = 1
        buffer_s, buffer_a0, buffer_a1, buffer_a2 = [], [], [], []
        buffer_r, buffer_avail = [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            # timestep[0] contains rewards, observations, etc.; see pysc2 for details.
            state, _, _, info = self.env.reset()
            ep_r = 0
            while True:
                a0, a1, a2 = teacher.action(state, info)
                # print(state)
                action = 1 if a0 == 0 else int(2 + a1 * scr_pixels + a2)
                buffer_s.append([state])
                buffer_avail.append([info])
                buffer_a0.append(a0)
                buffer_a1.append(a1)
                buffer_a2.append(a2)
                state, reward, done, info = self.env.step(action)

                buffer_r.append(reward)
                ep_r += reward
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    if done:
                        v_s_ = 0
                    else:
                        v_s_ = sess.run(self.AC.value, {self.AC.s: [state]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_  # compute v target
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    # Put each buffer together into a single array for the feed dict.
                    buffer_s = np.vstack(buffer_s)
                    buffer_a0 = np.vstack(buffer_a0)
                    buffer_a1 = np.vstack(buffer_a1)
                    buffer_a2 = np.vstack(buffer_a2)
                    buffer_v_target = np.vstack(buffer_v_target)
                    buffer_avail = np.vstack(buffer_avail)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a0: buffer_a0,
                        self.AC.a1: buffer_a1,
                        self.AC.a2: buffer_a2,
                        self.AC.v_target: buffer_v_target,
                        self.AC.available: buffer_avail,
                    }
                    test = self.AC.update_global_high(feed_dict)  # update parameters
                    closs, aloss = sess.run([self.AC.c_loss, self.AC.a_loss],
                                            feed_dict=feed_dict)
                    print("c_loss:", closs, "a_loss:", aloss)
                    buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
                    self.AC.pull_global()

                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
                    print(
                        self.name,
                        "episode:",
                        GLOBAL_EP,
                        '| reward: %.1f' % ep_r,
                        "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1],
                        # '| sigma:', test, # debug
                    )
                    GLOBAL_EP += 1
                    # self.globalAC.save_ckpt()
                    # with open("/summary.txt",'w') as f:
                    #    f.write('%.lf' % ep_r)
                    break
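
After each episode, both variants log a smoothed reward by blending the new episode reward into the last running value with weights 0.95 / 0.05. Below is a minimal standalone sketch of that smoothing; RunningReward is a hypothetical helper, not part of the examples, which instead keep the history in the module-level GLOBAL_RUNNING_R list.

class RunningReward:
    """Exponential moving average of episode rewards (beta = 0.95)."""

    def __init__(self, beta=0.95):
        self.beta = beta
        self.history = []

    def update(self, ep_r):
        if not self.history:
            self.history.append(ep_r)  # first episode seeds the average
        else:
            self.history.append(self.beta * self.history[-1] + (1 - self.beta) * ep_r)
        return self.history[-1]

tracker = RunningReward()
for ep_r in [10.0, 20.0, 0.0]:
    print("running reward: %.2f" % tracker.update(ep_r))
# approximately 10.00, 10.50, 9.97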