def run_DQfD(index, env):
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfDDDQN(env, DQfDConfig())
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        agent.demo_buffer = pickle.load(f)
    agent.pre_train()  # use the demo data to pre-train network
    scores = []
    for e in range(Config.episode):
        done = False
        score = 0  # sum of reward in one episode
        state = env.reset()
        while not done:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100  # penalize early termination (CartPole-style reward shaping)
            agent.perceive(state, action, reward, next_state, done, 0.0)
            agent.train_Q_network(pre_train=False, update=False)
            state = next_state
        if done:
            scores.append(score)
            agent.sess.run(agent.update_target_net)
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.replay_buffer), "  epsilon:", agent.epsilon)
            # if np.mean(scores[-min(10, len(scores)):]) > 495:
            #     break
    return scores
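The pickled file at Config.DEMO_DATA_PATH is not shown in this snippet. Below is a minimal sketch of how such demo data could be produced, assuming the buffer stores (state, action, reward, next_state, done, 0.0) tuples in the same layout that agent.perceive consumes above; expert_action is a hypothetical stand-in for whatever expert policy generated the demonstrations.

import pickle
from collections import deque

def generate_demo_data(env, expert_action, num_episodes, path):
    # Hedged sketch: collect expert transitions and pickle them as a deque,
    # mirroring the (state, action, reward, next_state, done, 0.0) layout above.
    demo_buffer = deque()
    for _ in range(num_episodes):
        state, done = env.reset(), False
        while not done:
            action = expert_action(state)
            next_state, reward, done, _ = env.step(action)
            demo_buffer.append((state, action, reward, next_state, done, 0.0))
            state = next_state
    with open(path, 'wb') as f:
        pickle.dump(demo_buffer, f)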
Example #2
def run_DQfD(index, env):
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        assert len(demo_transitions) == Config.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)

    agent.pre_train()  # use the demo data to pre-train network
    scores, e, replay_full_episode = [], 0, None
    while True:
        done, score, n_step_reward, state = False, 0, None, env.reset()
        t_q = deque(maxlen=Config.trajectory_n)
        while not done:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]  # earliest reward in the window, about to be dropped
            t_q.append([state, action, reward, next_state, done, 0.0])
            if len(t_q) == t_q.maxlen:
                if n_step_reward is None:  # only compute once when t_q first filled
                    n_step_reward = sum([t[2]*Config.GAMMA**i for i, t in enumerate(t_q)])
                else:
                    n_step_reward = (n_step_reward - reward_to_sub) / Config.GAMMA
                    n_step_reward += reward*Config.GAMMA**(Config.trajectory_n-1)
                t_q[0].extend([n_step_reward, next_state, done, t_q.maxlen])  # actual_n is max_len here
                agent.perceive(t_q[0])  # perceive when a transition is completed
                if agent.replay_memory.full():
                    agent.train_Q_network(update=False)  # train along with generation
                    if replay_full_episode is None:
                        replay_full_episode = e  # first episode trained with a full replay memory
            state = next_state
        if done:
            # handle transitions left in t_q
            t_q.popleft()  # first transition's n-step is already set
            transitions = set_n_step(t_q, Config.trajectory_n)
            for t in transitions:
                agent.perceive(t)
                if agent.replay_memory.full():
                    agent.train_Q_network(update=False)
                    if replay_full_episode is None:
                        replay_full_episode = e
            if agent.replay_memory.full():
                scores.append(score)
                agent.sess.run(agent.update_target_net)
            if replay_full_episode is not None:
                print("episode: {}  trained-episode: {}  score: {}  memory length: {}  epsilon: {}"
                      .format(e, e-replay_full_episode, score, len(agent.replay_memory), agent.epsilon))
            # if np.mean(scores[-min(10, len(scores)):]) > 495:
            #     break
            # agent.save_model()
        if len(scores) >= Config.episode:
            break
        e += 1
    return scores
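set_n_step is called above but not included in the snippet. A minimal sketch follows, under these assumptions: it receives the episode-tail transitions left in t_q (at most trajectory_n of them), Config.GAMMA is the same discount factor used in the loop, and it appends [n_step_reward, n_step_state, n_step_done, actual_n] to each transition, matching how t_q[0] is extended above.

def set_n_step(container, n):
    # Hedged sketch: `container` is assumed to hold at most n transitions of the form
    # [state, action, reward, next_state, done, is_demo] from the end of an episode.
    t_list = list(container)
    # discounted return of the remaining transitions, seen from the first one
    n_step_reward = sum([t[2] * Config.GAMMA**i for i, t in enumerate(t_list)])
    for begin in range(len(t_list)):
        end = min(len(t_list) - 1, begin + n - 1)  # the window cannot run past the episode end
        # append: n-step return, state/done after the window, and the actual window length
        t_list[begin].extend([n_step_reward, t_list[end][3], t_list[end][4], end - begin + 1])
        # slide the window forward by dropping the transition just handled
        n_step_reward = (n_step_reward - t_list[begin][2]) / Config.GAMMA
    return t_list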
Example #3
def run_DQfD(index, env, file_demo, file_name):
    with open(file_demo + 'demo.p', 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(
            itertools.islice(demo_transitions, 0, DQfDConfig.demo_buffer_size))
        assert len(demo_transitions) == DQfDConfig.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)

    agent.pre_train()  # use the demo data to pre-train network

    REWARDS, REWARD100, episode, replay_full_episode = [], [], 0, None
    reward100, n_step_reward, state = 0, None, env.reset()
    state = trans_state(state)
    t_q = deque(maxlen=DQfDConfig.trajectory_n)
    for steps in range(DQfDConfig.episode):
        action = agent.egreedy_action(state)  # e-greedy action for train
        next_state, reward, done, _ = env.step(action)
        next_state = trans_state(next_state)
        reward100 += reward
        REWARDS.append(reward)
        # record the earliest reward before it is pushed out of the n-step window
        reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]
        t_q.append([state, action, reward, next_state, done, 0.0])

        if len(t_q) == t_q.maxlen:
            if n_step_reward is None:  # compute the full sum only once, when t_q first fills
                n_step_reward = sum(
                    [t[2] * DQfDConfig.GAMMA**i for i, t in enumerate(t_q)])
            else:
                n_step_reward = (n_step_reward - reward_to_sub) / DQfDConfig.GAMMA
                n_step_reward += reward * DQfDConfig.GAMMA**(DQfDConfig.trajectory_n - 1)

            t_q[0].extend([n_step_reward, next_state, done, t_q.maxlen])  # actual_n is maxlen here
            update_eps = (steps + 1) % DQfDConfig.eps_gap == 0
            agent.perceive(t_q[0], update_eps=update_eps)  # perceive once the n-step return is set
            if (steps + 1) % DQfDConfig.UPDATE_ESTIMATE_NET == 0:
                agent.train_Q_network(update=False)  # train along with data generation
            if replay_full_episode is None:
                replay_full_episode = episode

        state = next_state

        if (steps + 1) % DQfDConfig.UPDATE_TARGET_NET == 0:
            if agent.replay_memory.full():
                agent.sess.run(agent.update_target_net)

        if (steps + 1) % DQfDConfig.eps_gap == 0:
            episode += 1
            if replay_full_episode is not None:
                print(
                    "episode: {}  trained-episode: {}  reward100: {}  memory length: {}  epsilon: {}"
                    .format(episode, episode - replay_full_episode, reward100,
                            len(agent.replay_memory), agent.epsilon))
            REWARD100.append(reward100)
            reward100 = 0

        if (steps + 1) % (DQfDConfig.eps_gap * 100) == 0:
            with open(file_name + 'REWARD100.p', 'wb') as f:
                pickle.dump(REWARD100, f, protocol=2)
            with open(file_name + 'REWARD100.txt', 'w') as f:
                f.write(str(REWARD100))
            with open(file_name + 'REWARDS.p', 'wb') as f:
                pickle.dump(REWARDS, f, protocol=2)
            with open(file_name + 'REWARDS.txt', 'w') as f:
                f.write(str(REWARDS))
            plot(1, REWARDS, file_name)

    with open(file_name + 'REWARD100.p', 'wb') as f:
        pickle.dump(REWARD100, f, protocol=2)
    with open(file_name + 'REWARD100.txt', 'w') as f:
        f.write(str(REWARD100))
    with open(file_name + 'REWARDS.p', 'wb') as f:
        pickle.dump(REWARDS, f, protocol=2)
    with open(file_name + 'REWARDS.txt', 'w') as f:
        f.write(str(REWARDS))
    plot(1, REWARDS, file_name)
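The rolling update of n_step_reward used in these training loops relies on the identity R_new = (R_old - r_oldest) / GAMMA + r_new * GAMMA**(n - 1), where r_oldest is the reward that just left the window. A standalone sanity check with arbitrary numbers:

# Standalone check of the sliding-window n-step return update used above.
GAMMA, n = 0.99, 4
rewards = [1.0, 2.0, 3.0, 4.0, 5.0]
R_old = sum(r * GAMMA**i for i, r in enumerate(rewards[0:n]))      # window r0..r3
R_new = sum(r * GAMMA**i for i, r in enumerate(rewards[1:n + 1]))  # window r1..r4
assert abs(R_new - ((R_old - rewards[0]) / GAMMA + rewards[n] * GAMMA**(n - 1))) < 1e-9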
Example #4
    coord = tf.train.Coordinator()

    scores, e, replay_full_episode = [], 0, None
    gameName = Config.GAME_NAME
    gameID = Config.ENV_NAME
    dataSetAction = Config.ACTION_SET
    env = gym.make(gameID)
    gymAction = env.unwrapped.get_action_meanings()
    actionTranslator = actionTranslate(gymAction, dataSetAction)
    episodeList = os.listdir(Config().SCREEN_PATH + gameName + '/')  # recorded episodes available for this game

    screenpath = Config.SCREEN_PATH
    trajpath = Config.TRAJ_PATH

    agent = DQfD('learner', env, DQfDConfig(), session, replayMemory)
    env = gym.make(Config.ENV_NAME)
    trainer = Trainer('learner', env, agent, episodeList)
    trainer.run()
    #threads = []
    #agent = DQfD('learner', env, DQfDConfig(), session, replayMemory)
    ##env = gym.make(Config.ENV_NAME)
    #local = DQfD('actor0', env, DQfDConfig(), session, replayMemory)
    #actor = Actor('actor0' + 0, env, agent, local)
    #actor.run()
    #with tf.device('/gpu:0'):
    #    agent = DQfD('learner', env, DQfDConfig(), session, replayMemory)
    #    learner = Learner('learner',  agent)
    #    #learner.run()
    #    #print(agent.getSelectNet())
    #    learn = lambda: learner.run()
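actionTranslate is not part of this fragment. A minimal sketch follows, under the assumption that it maps each label in the dataset's action set to the index of the matching entry returned by env.unwrapped.get_action_meanings(); the fallback to index 0 for unmatched labels is an assumption, not the repo's documented behavior.

def actionTranslate(gymAction, dataSetAction):
    # Hedged sketch: dataset action label -> gym action index; unmatched labels
    # fall back to 0 (usually NOOP) here, which may differ from the real helper.
    return [gymAction.index(label) if label in gymAction else 0
            for label in dataSetAction]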
Example #5
def run_DQfD(index, env):
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(
            itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        assert len(demo_transitions) == Config.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)

    agent.pre_train()  # use the demo data to pre-train network
    scores = []
    for e in range(Config.episode):
        done = False
        score = 0  # sum of reward in one episode
        state = env.reset()
        t_q = deque(maxlen=Config.trajectory_n)
        n_step_reward = None
        transitions = []
        while not done:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            reward_to_sub = 0. if len(t_q) == 0 else t_q[0][2]  # record the earliest reward in the n-step window
            t_q.append([state, action, reward, next_state, done, 0.0])
            if len(t_q) == t_q.maxlen:
                if n_step_reward is None:
                    n_step_reward = sum([
                        t[2] * Config.GAMMA**i for i, t in enumerate(t_q)
                    ])  # only compute once when t_q is full
                else:
                    n_step_reward = (
                        n_step_reward - reward_to_sub
                    ) / Config.GAMMA + reward * Config.GAMMA**(
                        Config.trajectory_n - 1)
                t_q[0].extend([
                    n_step_reward, next_state, done, t_q.maxlen
                ])  # [n_step_r, n_step_s, n_step_done, actual_n]
                # t_q[0].extend([n_step_reward, next_state, done])  # [n_step_r, n_step_s, n_step_done, actual_n]
                # assert len(t_q[0]) == 10
                agent.perceive(t_q[0])  # perceive once the transition's n-step return is set
                agent.train_Q_network(update=False)  # unlike demo-data generation, training runs alongside collection
            state = next_state
        if done:
            # handle transitions in t_q
            t_q.popleft()  # first transition's n-step is already set
            transitions = set_n_step(t_q)
            for t in transitions:
                agent.perceive(t)
                agent.train_Q_network(update=False)
            scores.append(score)
            agent.sess.run(agent.update_target_net)
            print("episode: {}  score: {}  memory length: {}  epsilon: {}".
                  format(e, score, len(agent.replay_memory), agent.epsilon))
            # if np.mean(scores[-min(10, len(scores)):]) > 495:
            #     break
            agent.save_model()
    return scores
Example #6
def _train(dic_exp_conf, dic_agent_conf, dic_traffic_env_conf, dic_path):
    random.seed(dic_agent_conf['SEED'])
    np.random.seed(dic_agent_conf['SEED'])
    tf.set_random_seed(dic_agent_conf['SEED'])

    dic_path["PATH_TO_LOG"] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'],
                                           "train_round")
    #
    # dic_path['PATH_TO_SUMO_CONF'] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'], "sumo_conf", task)

    if not os.path.exists(dic_path['PATH_TO_LOG']):
        os.makedirs(dic_path['PATH_TO_LOG'])

    # dic_exp_conf = copy.deepcopy(self.dic_exp_conf)

    if dic_traffic_env_conf['SIMULATOR_TYPE'] == 'sumo':
        path_to_work_directory = dic_path["PATH_TO_SUMO_CONF"]
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=path_to_work_directory,
            dic_traffic_env_conf=dic_traffic_env_conf)

    elif dic_traffic_env_conf['SIMULATOR_TYPE'] == 'anon':
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=dic_path["PATH_TO_DATA"],
            dic_traffic_env_conf=dic_traffic_env_conf)
    dic_agent_conf["PHI"] = 5
    dic_agent_conf["MIN_GREEN_VEC"] = 3
    dic_agent_conf["MAX_RED_VEC"] = 6
    demo_path = "../frap/demo_{}.p".format(dic_exp_conf["TRAFFIC_FILE"])

    with open(demo_path, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(
            itertools.islice(demo_transitions, 0, Config.demo_buffer_size))

    sess = tf.InteractiveSession()
    actor = Actor(sess=sess,
                  n_features=16,
                  n_actions=8,
                  dic_traffic_env_conf=dic_traffic_env_conf,
                  lr=1e-3)  # initialize the Actor
    critic = Critic(sess=sess,
                    n_features=16,
                    config=DQfDConfig(),
                    dic_traffic_env_conf=dic_traffic_env_conf,
                    demo=demo_transitions,
                    lr=1e-3)  # initialize the Critic
    # agent = Agent(sess=sess, n_features=16, config=DQfDConfig(), dic_traffic_env_conf=dic_traffic_env_conf,
    #               demo=demo_transitions, lr=1e-3)
    # actor = Actor(sess=sess, n_features=16, n_actions=8, dic_traffic_env_conf=dic_traffic_env_conf, lr=1e-3)
    # critic = Critic(sess=sess, n_features=16, lr=1e-3)
    sess.run(tf.global_variables_initializer())  # initialize all TF variables

    for i in range(10):
        state, action = critic.train_Q_network(pre_train=True)
        actor.pretrain(state, action)

    for i in range(501):

        done = False
        state = env.reset()
        step_num = 0
        while not done and step_num < int(
                dic_exp_conf["EPISODE_LEN"] /
                dic_traffic_env_conf["MIN_ACTION_TIME"]):
            action_list = []
            for one_state in state:
                s = convert_to_input(one_state, dic_traffic_env_conf)
                action, probs = actor.choose_action(s)  # choose an action for each intersection's state
                # action, probs = agent.choose_action(s)
                action_list.append(action)  # for multi-state

            next_state, reward, done, ave_reward = env.step(action_list)

            s = convert_to_input(state[0], dic_traffic_env_conf)
            s_ = convert_to_input(next_state[0], dic_traffic_env_conf)

            next_action, _ = actor.choose_action(s_)
            # next_action, _ = agent.choose_action(s_)
            # #
            # q_a = critic.learn(s, np.array(ave_reward), s_, np.array([action]), np.array([next_action]), probs)
            # q_a = critic.learn(s, np.array(ave_reward), s_, np.array([action]), np.array([next_action]))
            if i != 0:
                q_a = critic.learn(s, np.array(ave_reward), s_,
                                   np.array([action]), np.array([next_action]),
                                   probs)
                # q_a = critic.learn(s, np.array(reward), s_)
                actor.learn(s, np.array([action]), q_a)
            # agent.learn_actor(s, np.array([action]), q_a)

            state = next_state
            step_num += 1
        # if i % 3 == 0 and i != 0:
        #     critic.sess.run(critic.update_target_net)
        env.bulk_log(i)
        write_summary(dic_path, dic_exp_conf, i)
Example #7
def run_DQfD(index, env):
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfDDDQN(env, DQfDConfig())
    # load the expert data for demonstration
    dq2 = deque()
    dq3 = deque()
    with open(Config.DEMO_DATA_PATH_1, "rb") as f:
        fav1 = pickle.load(f)
    with open(Config.DEMO_DATA_PATH_2, "rb") as f:
        fav2 = pickle.load(f)
    with open(Config.DEMO_DATA_PATH_3, "rb") as f:
        fav3 = pickle.load(f)
    dq1 = fav1  # the first demo file is assumed to already hold transitions in buffer format

    # convert the AC demo dict (parallel lists) into transition tuples
    demo = list(zip(fav2['observations'], fav2['actions'], fav2['rewards'],
                    fav2['next_observations'], fav2['done'], fav2['expert']))
    dq2.extend(demo)

    # convert the PG demo dict (parallel lists) into transition tuples
    demo1 = list(zip(fav3['observations'], fav3['actions'], fav3['rewards'],
                     fav3['next_observations'], fav3['done'], fav3['expert']))
    dq3.extend(demo1)

    # append all the experts data to demo_buffer of the agent
    agent.demo_buffer.extend(dq1)
    agent.demo_buffer.extend(dq2)
    agent.demo_buffer.extend(dq3)

    # shuffle the demo_buffer before pre-training
    random.shuffle(agent.demo_buffer)

    # use the demo data to pre-train network
    agent.pre_train()

    scores = []
    for e in range(Config.episode):
        done = False
        score = 0  # sum of reward in one episode
        state = env.reset()
        while not done:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            agent.perceive(state, action, reward, next_state, done, 0.0)
            agent.train_Q_network(pre_train=False, update=False)
            state = next_state
        if done:
            scores.append(score)
            agent.sess.run(agent.update_target_net)
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.replay_buffer), "  epsilon:", agent.epsilon)
            # if np.mean(scores[-min(10, len(scores)):]) > 495:
            #     break
    return scores
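The AC and PG demo files above are dictionaries of parallel lists keyed by 'observations', 'actions', 'rewards', 'next_observations', 'done' and 'expert'. A minimal sketch of how such a file could be recorded, where expert_policy is a hypothetical stand-in for the AC/PG policy that produced the demonstrations and the value stored under 'expert' is an assumed flag:

import pickle

def record_expert_demo(env, expert_policy, num_steps, path):
    # Hedged sketch: log an expert rollout in the dict-of-parallel-lists format
    # that fav2/fav3 are unpacked from above.
    data = {'observations': [], 'actions': [], 'rewards': [],
            'next_observations': [], 'done': [], 'expert': []}
    state, done = env.reset(), False
    for _ in range(num_steps):
        if done:
            state, done = env.reset(), False
        action = expert_policy(state)
        next_state, reward, done, _ = env.step(action)
        data['observations'].append(state)
        data['actions'].append(action)
        data['rewards'].append(reward)
        data['next_observations'].append(next_state)
        data['done'].append(done)
        data['expert'].append(1.0)  # assumed marker for expert-generated transitions
        state = next_state
    with open(path, 'wb') as f:
        pickle.dump(data, f)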