Example #1
class GameManager:
    def __init__(self):
        self.env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
            size=(map_size, map_size), max_num=obj_num)
        self.reset()

    def reset(self):
        # Re-initialize the scene with a random number of objects (1..obj_num)
        # instead of calling env.reset(), and return the observation.
        current_num = np.random.randint(obj_num) + 1
        self.env.randominit_crowded(current_num)
        s = self.env.getstate_3()
        return s

    def step(self, action):
        # Decode the flat action index into (object index, move type) and step the env.
        choice_index = action // action_type
        choice_action = action % action_type
        r, done = self.env.move(choice_index, choice_action)
        s_ = self.env.getstate_3()
        return s_, r, done
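
For context, a minimal driver loop for this GameManager might look like the sketch below. The random policy, episode count, and step limit are illustrative assumptions and not part of the original code; only the globals map_size, obj_num, and action_type come from the examples in this listing.

import numpy as np

manager = GameManager()
for episode in range(10):  # illustrative episode count
    s = manager.reset()
    for t in range(20):  # illustrative step limit
        a = np.random.randint(action_type * obj_num)  # random flat action index
        s_, r, done = manager.step(a)
        s = s_
        if done:
            break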
Example #2
    def __init__(self, name, globalAC):
        self.env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
            size=(map_size, map_size), max_num=obj_num)
        self.name = name
        self.AC = ACNet(name, globalAC)
Example #3
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
            size=(map_size, map_size), max_num=obj_num)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            # Re-initialize the scene with a random number of objects
            # (1..obj_num) instead of calling env.reset().
            current_num = np.random.randint(obj_num) + 1
            self.env.randominit_crowded(current_num)
            s = self.env.getstate_3()
            ep_r = 0
            rnn_state = SESS.run(self.AC.init_state)  # zero RNN state at episode start
            keep_state = rnn_state.copy()  # keep RNN state for updating the global net
            for ep_t in range(MAX_EP_STEP):
                # if self.name == 'W_0':
                #     self.env.render()
                a, rnn_state_ = self.AC.choose_action(s, rnn_state)
                # Step the environment by decoding the flat action index into
                # (object index, move type) instead of calling a gym-style env.step().
                choice_index = a // action_type
                choice_action = a % action_type
                r, done = self.env.move(choice_index, choice_action)
                s_ = self.env.getstate_3()

                # Override the env's done flag: the episode only ends at MAX_EP_STEP.
                done = ep_t == MAX_EP_STEP - 1

                ep_r += r
                buffer_s.append([s])
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update the global net and sync the local net
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {
                            self.AC.s: s_[np.newaxis, :],
                            self.AC.init_state: rnn_state_
                        })[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # discount rewards from the back of the buffer
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s = np.vstack(buffer_s)
                    buffer_a = np.array(buffer_a)
                    buffer_v_target = np.vstack(buffer_v_target)

                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        self.AC.init_state: keep_state,
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    keep_state = rnn_state_.copy()  # use the new RNN state as the initial state for the next update window

                s = s_
                rnn_state = rnn_state_  # renew rnn state
                total_step += 1
                if done:
                    # Record a smoothed running episode reward.
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
                    print(
                        self.name,
                        "Ep:",
                        GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    GLOBAL_EP += 1
                    break

                # Periodically save a checkpoint (roughly every 100 global updates).
                n_updates = total_step // UPDATE_GLOBAL_ITER
                if n_updates > 0 and n_updates % 100 == 0:
                    print('model %d saved' % n_updates)
                    saver.save(SESS, os.path.join(weight_path, 'model_%d.ckpt' % n_updates))
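
The examples above define the worker only; the sketch below shows how such A3C workers are typically launched, following the standard TF1 coordinator/thread pattern implied by the globals SESS, COORD, saver, and GLOBAL_NET_SCOPE. The worker count and device placement are assumptions, not taken from the original code.

import threading
import tensorflow as tf

SESS = tf.Session()
with tf.device("/cpu:0"):
    GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE)  # global network; each worker builds a local copy
    workers = [Worker('W_%i' % i, GLOBAL_AC) for i in range(4)]  # 4 workers assumed

COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
saver = tf.train.Saver()

worker_threads = []
for worker in workers:
    t = threading.Thread(target=worker.work)
    t.start()
    worker_threads.append(t)
COORD.join(worker_threads)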
Example #4
MAX_EP_STEP = 20
MAX_GLOBAL_EP = 1000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 16
GAMMA = 0.95
ENTROPY_BETA = 0.001
LR_A = 0.0001  # learning rate for actor
LR_C = 0.0001  # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

map_size = 64     # side length of the square scene grid
obj_num = 25      # maximum number of objects in a scene
action_type = 5   # number of move types per object

env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
    size=(map_size, map_size), max_num=obj_num)
N_S = [map_size, map_size, 2]  # observation shape: a two-channel map
N_A = action_type * obj_num    # size of the flat discrete action space (one move type per object)

tensorboard_path = "tensorboard/20201022/"
weight_path = "weights_20201022/"

if not os.path.exists(tensorboard_path):
    os.makedirs(tensorboard_path)

if not os.path.exists(weight_path):
    os.makedirs(weight_path)


class ACNet(object):
    def __init__(self, scope, globalAC=None):
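
The constants above fix both spaces: the observation is treated as a two-channel map of shape N_S, and the policy emits a flat index in [0, N_A) that the workers split into an object index and a move type. A minimal sketch of that decoding (the helper name decode_action is hypothetical):

def decode_action(a, action_type=5):
    # Equivalent to `a // action_type` and `a % action_type` in the worker code:
    # returns (object index, move type) for a flat action index.
    return divmod(a, action_type)

print(decode_action(123))  # -> (24, 3): move object 24 with move type 3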
Example #5
    def __init__(self, wid):
        self.wid = wid
        self.env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
            size=(map_size, map_size), max_num=obj_num)
        self.ppo = GLOBAL_PPO
Example #6
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
            size=(map_size, map_size), max_num=obj_num)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            # Re-initialize the scene with a random number of objects
            # (1..obj_num) instead of calling env.reset().
            current_num = np.random.randint(obj_num) + 1
            self.env.randominit_crowded(current_num)
            s = self.env.getstate_3()
            rnn_state = sess.run(self.ppo.init_state)  # zero RNN state at episode start
            keep_state = rnn_state.copy()  # keep RNN state for updating the global net
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []

            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # the global PPO is currently updating
                    ROLLING_EVENT.wait()  # wait until the update finishes
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear the buffers; collect data with the new policy
                a, rnn_state_ = self.ppo.choose_action(s, rnn_state)

                # Step the environment by decoding the flat action index into
                # (object index, move type) instead of calling a gym-style env.step().
                choice_index = a // action_type
                choice_action = a % action_type
                r, done = self.env.move(choice_index, choice_action)
                s_ = self.env.getstate_3()

                if done:
                    r = -10
                s_stack = s.reshape(-1)
                buffer_s.append(s_stack)
                buffer_a.append(a)
                buffer_r.append(r - 1)  # reward shaping: shift each step reward by -1 (a terminal step becomes -11)
                s = s_
                rnn_state = rnn_state_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count toward the minimum batch size; no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    if done:
                        v_s_ = 0  # end of episode
                    else:
                        v_s_ = self.ppo.get_v(s_, rnn_state_)

                    # Compute discounted returns from the back of the buffer.
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs = np.vstack(buffer_s)
                    ba = np.vstack(buffer_a)
                    br = np.array(discounted_r)[:, None]
                    buffer_s, buffer_a, buffer_r = [], [], []

                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

                    if done: break
            # Periodically save a checkpoint based on the episode count.
            ckpt_id = GLOBAL_EP // UPDATE_STEP
            if ckpt_id > 0 and ckpt_id % 10 == 0:
                print('model %d saved' % ckpt_id)
                saver.save(sess, os.path.join(weight_path, 'model_%d.ckpt' % ckpt_id))

            # Record a smoothed running episode reward, to plot later.
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
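
The worker above only produces rollouts; the consumer behind QUEUE, UPDATE_EVENT, and ROLLING_EVENT is not part of these excerpts. The sketch below follows the usual distributed-PPO event/queue pattern; GLOBAL_PPO.train_on_batch is a hypothetical update call, and the surrounding globals are assumed to exist as in the worker code.

def update_loop():
    # Runs in its own thread: wakes up once workers have collected a full batch,
    # drains the queue, applies the PPO update, then lets workers resume rollouts.
    global GLOBAL_UPDATE_COUNTER
    while not COORD.should_stop():
        UPDATE_EVENT.wait()  # set by a worker when the batch is full
        data = [QUEUE.get() for _ in range(QUEUE.qsize())]
        batch = np.vstack(data)  # each item is the hstack((bs, ba, br)) block pushed by a worker
        GLOBAL_PPO.train_on_batch(batch)  # hypothetical; the real update method is not shown
        GLOBAL_UPDATE_COUNTER = 0
        UPDATE_EVENT.clear()  # updating finished
        ROLLING_EVENT.set()  # allow workers to collect data again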
Example #7
    def __init__(self):
        self.env = ENV_scene_new_action_pre_state_penalty_conflict_heuristic_transpose_shape_poly(
            size=(map_size, map_size), max_num=obj_num)
        self.reset()