def run_DQfD(index, env):
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfDDDQN(env, DQfDConfig())
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        agent.demo_buffer = pickle.load(f)
    agent.pre_train()  # use the demo data to pre-train network
    scores = []
    for e in range(Config.episode):
        done = False
        score = 0  # sum of reward in one episode
        state = env.reset()
        while done is False:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            agent.perceive(state, action, reward, next_state, done, 0.0)
            agent.train_Q_network(pre_train=False, update=False)
            state = next_state
            if done:
                scores.append(score)
                agent.sess.run(agent.update_target_net)
                print("episode:", e, " score:", score, " memory length:",
                      len(agent.replay_buffer), " epsilon:", agent.epsilon)
                # if np.mean(scores[-min(10, len(scores)):]) > 495:
                #     break
    return scores
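# Hypothetical helper, not part of the snippets in this section. The demo pickle
# loaded by run_DQfD above is assumed to hold transitions shaped like the
# perceive(...) call, i.e. (state, action, reward, next_state, done, demo_flag).
# A minimal sketch of how such a file might be produced from an already trained
# policy; expert_policy, n_transitions, and the 1.0 demo flag are assumptions,
# not taken from the original code.
import pickle
from collections import deque

def save_demo_data(env, expert_policy, path, n_transitions):
    demo = deque()
    state = env.reset()
    while len(demo) < n_transitions:
        action = expert_policy(state)
        next_state, reward, done, _ = env.step(action)
        demo.append((state, action, reward, next_state, done, 1.0))  # assumed demo flag
        state = env.reset() if done else next_state
    with open(path, 'wb') as f:
        pickle.dump(demo, f)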
def run_DQfD(index, env):
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        assert len(demo_transitions) == Config.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)
    agent.pre_train()  # use the demo data to pre-train the network
    scores, e, replay_full_episode = [], 0, None
    while True:
        done, score, n_step_reward, state = False, 0, None, env.reset()
        t_q = deque(maxlen=Config.trajectory_n)
        while done is False:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]  # record the earliest reward for the subtraction below
            t_q.append([state, action, reward, next_state, done, 0.0])
            if len(t_q) == t_q.maxlen:
                if n_step_reward is None:  # only compute once, when t_q is first filled
                    n_step_reward = sum([t[2] * Config.GAMMA**i for i, t in enumerate(t_q)])
                else:
                    n_step_reward = (n_step_reward - reward_to_sub) / Config.GAMMA
                    n_step_reward += reward * Config.GAMMA**(Config.trajectory_n - 1)
                t_q[0].extend([n_step_reward, next_state, done, t_q.maxlen])  # actual_n is maxlen here
                agent.perceive(t_q[0])  # perceive when a transition is completed
                if agent.replay_memory.full():
                    agent.train_Q_network(update=False)  # train along with generation
                    replay_full_episode = replay_full_episode or e
            state = next_state
            if done:
                # handle transitions left in t_q
                t_q.popleft()  # the first transition's n-step info is already set
                transitions = set_n_step(t_q, Config.trajectory_n)
                for t in transitions:
                    agent.perceive(t)
                    if agent.replay_memory.full():
                        agent.train_Q_network(update=False)
                        replay_full_episode = replay_full_episode or e
                if agent.replay_memory.full():
                    scores.append(score)
                    agent.sess.run(agent.update_target_net)
                if replay_full_episode is not None:
                    print("episode: {} trained-episode: {} score: {} memory length: {} epsilon: {}"
                          .format(e, e - replay_full_episode, score, len(agent.replay_memory), agent.epsilon))
                # if np.mean(scores[-min(10, len(scores)):]) > 495:
                #     break
                # agent.save_model()
        if len(scores) >= Config.episode:
            break
        e += 1
    return scores
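# set_n_step(...) is called above to finish the transitions left in t_q at the
# end of an episode, but it is not defined in this section. A sketch of what it
# might look like, assuming each transition is a list [state, action, reward,
# next_state, done, demo_flag] that must be extended with [n_step_reward,
# n_step_state, n_step_done, actual_n]; the default value of n is an assumption
# that also covers the single-argument call set_n_step(t_q) used in a later
# variant below.
def set_n_step(container, n=Config.trajectory_n):
    t_list = list(container)
    # discounted return of the first (up to) n remaining transitions
    n_step_reward = sum([t[2] * Config.GAMMA**i for i, t in enumerate(t_list[0:min(len(t_list), n)])])
    for begin in range(len(t_list)):
        end = min(len(t_list) - 1, begin + n - 1)
        n_step_state, n_step_done = t_list[end][3], t_list[end][4]
        t_list[begin].extend([n_step_reward, n_step_state, n_step_done, end - begin + 1])
        # slide the window: drop the leading reward, append the next one if any
        n_step_reward = (n_step_reward - t_list[begin][2]) / Config.GAMMA
        if end + 1 < len(t_list):
            n_step_reward += t_list[end + 1][2] * Config.GAMMA**(end - begin)
    return t_list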
def run_DQfD(index, env, file_demo, file_name):
    with open(file_demo + 'demo.p', 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, DQfDConfig.demo_buffer_size))
        assert len(demo_transitions) == DQfDConfig.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)
    agent.pre_train()  # use the demo data to pre-train the network

    REWARDS, REWARD100, episode, replay_full_episode = [], [], 0, None
    reward100, n_step_reward, state = 0, None, env.reset()
    state = trans_state(state)
    t_q = deque(maxlen=DQfDConfig.trajectory_n)
    # note: this variant never resets the environment on done, so it treats the
    # task as continuing (or relies on an auto-resetting wrapper)
    for steps in range(DQfDConfig.episode):
        action = agent.egreedy_action(state)  # e-greedy action for train
        next_state, reward, done, _ = env.step(action)
        next_state = trans_state(next_state)
        reward100 += reward
        REWARDS.append(reward)
        # record the earliest reward before the append below evicts it
        reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]
        t_q.append([state, action, reward, next_state, done, 0.0])
        if len(t_q) == t_q.maxlen:
            if n_step_reward is None:  # only compute once, when t_q is first filled
                n_step_reward = sum([t[2] * DQfDConfig.GAMMA**i for i, t in enumerate(t_q)])
            else:
                n_step_reward = (n_step_reward - reward_to_sub) / DQfDConfig.GAMMA
                n_step_reward += reward * DQfDConfig.GAMMA**(DQfDConfig.trajectory_n - 1)
            t_q[0].extend([n_step_reward, next_state, done, t_q.maxlen])  # actual_n is maxlen here
            update_eps = (steps + 1) % DQfDConfig.eps_gap == 0
            agent.perceive(t_q[0], update_eps=update_eps)  # perceive when a transition is completed
            if (steps + 1) % DQfDConfig.UPDATE_ESTIMATE_NET == 0:
                agent.train_Q_network(update=False)  # train along with generation
                replay_full_episode = replay_full_episode or episode
        state = next_state

        if (steps + 1) % DQfDConfig.UPDATE_TARGET_NET == 0:
            if agent.replay_memory.full():
                agent.sess.run(agent.update_target_net)

        if (steps + 1) % DQfDConfig.eps_gap == 0:
            episode += 1
            if replay_full_episode is not None:
                print("episode: {} trained-episode: {} reward100: {} memory length: {} epsilon: {}"
                      .format(episode, episode - replay_full_episode, reward100,
                              len(agent.replay_memory), agent.epsilon))
            REWARD100.append(reward100)
            reward100 = 0

        if (steps + 1) % (DQfDConfig.eps_gap * 100) == 0:
            # periodically checkpoint the reward curves
            with open(file_name + 'REWARD100.p', 'wb') as f:
                pickle.dump(REWARD100, f, protocol=2)
            with open(file_name + 'REWARD100.txt', 'w') as f:
                f.write(str(REWARD100))
            with open(file_name + 'REWARDS.p', 'wb') as f:
                pickle.dump(REWARDS, f, protocol=2)
            with open(file_name + 'REWARDS.txt', 'w') as f:
                f.write(str(REWARDS))
            plot(1, REWARDS, file_name)

    # final dump of the reward curves after the last step
    with open(file_name + 'REWARD100.p', 'wb') as f:
        pickle.dump(REWARD100, f, protocol=2)
    with open(file_name + 'REWARD100.txt', 'w') as f:
        f.write(str(REWARD100))
    with open(file_name + 'REWARDS.p', 'wb') as f:
        pickle.dump(REWARDS, f, protocol=2)
    with open(file_name + 'REWARDS.txt', 'w') as f:
        f.write(str(REWARDS))
    plot(1, REWARDS, file_name)
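# plot(...) above is only known from its call signature plot(1, REWARDS, file_name)
# and is not defined in this section. A minimal matplotlib sketch of what it
# might do, assuming it simply saves the reward curve next to the pickled reward
# files (the output file name is an assumption):
import matplotlib
matplotlib.use('Agg')  # render without a display so the figure can be saved headlessly
import matplotlib.pyplot as plt

def plot(fig_index, rewards, file_name):
    plt.figure(fig_index)
    plt.clf()
    plt.plot(rewards)
    plt.xlabel('step')
    plt.ylabel('reward')
    plt.savefig(file_name + 'rewards.png')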
coord = tf.train.Coordinator()
scores, e, replay_full_episode = [], 0, None

gameName = Config.GAME_NAME
gameID = Config.ENV_NAME
dataSetAction = Config.ACTION_SET
env = gym.make(gameID)
gymAction = env.unwrapped.get_action_meanings()
actionTranslator = actionTranslate(gymAction, dataSetAction)

episodeList = os.listdir(Config().SCREEN_PATH + gameName + '/')  # dir is your directory path
screenpath = Config.SCREEN_PATH
trajpath = Config.TRAJ_PATH

agent = DQfD('learner', env, DQfDConfig(), session, replayMemory)
env = gym.make(Config.ENV_NAME)
trainer = Trainer('learner', env, agent, episodeList)
trainer.run()

# threads = []
# agent = DQfD('learner', env, DQfDConfig(), session, replayMemory)
# # env = gym.make(Config.ENV_NAME)
# local = DQfD('actor0', env, DQfDConfig(), session, replayMemory)
# actor = Actor('actor0' + 0, env, agent, local)
# actor.run()

# with tf.device('/gpu:0'):
#     agent = DQfD('learner', env, DQfDConfig(), session, replayMemory)
#     learner = Learner('learner', agent)
#     # learner.run()
#     # print(agent.getSelectNet())
#     learn = lambda: learner.run()
def run_DQfD(index, env):
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        assert len(demo_transitions) == Config.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)
    agent.pre_train()  # use the demo data to pre-train the network
    scores = []
    for e in range(Config.episode):
        done = False
        score = 0  # sum of reward in one episode
        state = env.reset()
        t_q = deque(maxlen=Config.trajectory_n)
        n_step_reward = None
        transitions = []
        while done is False:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            reward_to_sub = 0. if len(t_q) == 0 else t_q[0][2]  # record the earliest reward for the subtraction below
            t_q.append([state, action, reward, next_state, done, 0.0])
            if len(t_q) == t_q.maxlen:
                if n_step_reward is None:  # only compute once, when t_q is first filled
                    n_step_reward = sum([t[2] * Config.GAMMA**i for i, t in enumerate(t_q)])
                else:
                    n_step_reward = (n_step_reward - reward_to_sub) / Config.GAMMA \
                                    + reward * Config.GAMMA**(Config.trajectory_n - 1)
                t_q[0].extend([n_step_reward, next_state, done, t_q.maxlen])  # [n_step_r, n_step_s, n_step_done, actual_n]
                # t_q[0].extend([n_step_reward, next_state, done])
                # assert len(t_q[0]) == 10
                agent.perceive(t_q[0])  # perceive when a transition is completed
                agent.train_Q_network(update=False)  # unlike demo-data generation, training goes along with generation
            state = next_state
            if done:
                # handle the transitions left in t_q
                t_q.popleft()  # the first transition's n-step info is already set
                transitions = set_n_step(t_q)
                for t in transitions:
                    agent.perceive(t)
                    agent.train_Q_network(update=False)
                scores.append(score)
                agent.sess.run(agent.update_target_net)
                print("episode: {} score: {} memory length: {} epsilon: {}"
                      .format(e, score, len(agent.replay_memory), agent.epsilon))
                # if np.mean(scores[-min(10, len(scores)):]) > 495:
                #     break
    agent.save_model()
    return scores
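# A usage sketch only, not taken from the original code. The -100 terminal-reward
# shaping and the score == 499 check in the run_DQfD variants above suggest a
# CartPole-v1 style environment with a 500-step cap, but the environment id and
# the number of independent runs below are assumptions for illustration.
import gym

if __name__ == '__main__':
    env = gym.make('CartPole-v1')  # assumed environment
    all_scores = []
    for index in range(3):         # number of independent runs is arbitrary here
        all_scores.append(run_DQfD(index, env))
    env.close()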
def _train(dic_exp_conf, dic_agent_conf, dic_traffic_env_conf, dic_path):
    random.seed(dic_agent_conf['SEED'])
    np.random.seed(dic_agent_conf['SEED'])
    tf.set_random_seed(dic_agent_conf['SEED'])

    dic_path["PATH_TO_LOG"] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'], "train_round")
    # dic_path['PATH_TO_SUMO_CONF'] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'], "sumo_conf", task)
    if not os.path.exists(dic_path['PATH_TO_LOG']):
        os.makedirs(dic_path['PATH_TO_LOG'])

    # dic_exp_conf = copy.deepcopy(self.dic_exp_conf)
    if dic_traffic_env_conf['SIMULATOR_TYPE'] == 'sumo':
        path_to_work_directory = dic_path["PATH_TO_SUMO_CONF"]
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=path_to_work_directory,
            dic_traffic_env_conf=dic_traffic_env_conf)
    elif dic_traffic_env_conf['SIMULATOR_TYPE'] == 'anon':
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=dic_path["PATH_TO_DATA"],
            dic_traffic_env_conf=dic_traffic_env_conf)

    dic_agent_conf["PHI"] = 5
    dic_agent_conf["MIN_GREEN_VEC"] = 3
    dic_agent_conf["MAX_RED_VEC"] = 6

    demo_path = "../frap/demo_{}.p".format(dic_exp_conf["TRAFFIC_FILE"])
    with open(demo_path, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))

    sess = tf.InteractiveSession()
    actor = Actor(sess=sess, n_features=16, n_actions=8,
                  dic_traffic_env_conf=dic_traffic_env_conf, lr=1e-3)  # initialize the Actor
    critic = Critic(sess=sess, n_features=16, config=DQfDConfig(),
                    dic_traffic_env_conf=dic_traffic_env_conf,
                    demo=demo_transitions, lr=1e-3)  # initialize the Critic
    # agent = Agent(sess=sess, n_features=16, config=DQfDConfig(), dic_traffic_env_conf=dic_traffic_env_conf,
    #               demo=demo_transitions, lr=1e-3)
    # actor = Actor(sess=sess, n_features=16, n_actions=8, dic_traffic_env_conf=dic_traffic_env_conf, lr=1e-3)
    # critic = Critic(sess=sess, n_features=16, lr=1e-3)
    sess.run(tf.global_variables_initializer())  # initialize the parameters

    # pre-train the critic on the demo data and distill its actions into the actor
    for i in range(10):
        state, action = critic.train_Q_network(pre_train=True)
        actor.pretrain(state, action)

    for i in range(501):
        done = False
        state = env.reset()
        step_num = 0
        while not done and step_num < int(dic_exp_conf["EPISODE_LEN"] / dic_traffic_env_conf["MIN_ACTION_TIME"]):
            action_list = []
            for one_state in state:
                s = convert_to_input(one_state, dic_traffic_env_conf)
                action, probs = actor.choose_action(s)  # one for multi-state, the other for multi-intersection
                # action, probs = agent.choose_action(s)
                action_list.append(action)  # for multi-state

            next_state, reward, done, ave_reward = env.step(action_list)

            s = convert_to_input(state[0], dic_traffic_env_conf)
            s_ = convert_to_input(next_state[0], dic_traffic_env_conf)
            next_action, _ = actor.choose_action(s_)
            # next_action, _ = agent.choose_action(s_)

            # q_a = critic.learn(s, np.array(ave_reward), s_, np.array([action]), np.array([next_action]))
            if i != 0:  # no updates during the very first episode
                q_a = critic.learn(s, np.array(ave_reward), s_, np.array([action]),
                                   np.array([next_action]), probs)
                # q_a = critic.learn(s, np.array(reward), s_)
                actor.learn(s, np.array([action]), q_a)
                # agent.learn_actor(s, np.array([action]), q_a)

            state = next_state
            step_num += 1
        # if i % 3 == 0 and i != 0:
        #     critic.sess.run(critic.update_target_net)
        env.bulk_log(i)
        write_summary(dic_path, dic_exp_conf, i)
def run_DQfD(index, env):
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfDDDQN(env, DQfDConfig())

    # load the expert data for demonstration
    dq1 = deque()
    dq2 = deque()
    dq3 = deque()
    fav1 = pickle.load(open(Config.DEMO_DATA_PATH_1, "rb"))
    fav2 = pickle.load(open(Config.DEMO_DATA_PATH_2, "rb"))
    fav3 = pickle.load(open(Config.DEMO_DATA_PATH_3, "rb"))
    dq1 = fav1

    # converting AC dict to a deque class
    state = []
    next_state = []
    action = []
    reward = []
    done = []
    expert = []
    demo = []
    for i in range(len(fav2['observations'])):
        state.append(fav2['observations'][i])
        action.append(fav2['actions'][i])
        reward.append(fav2['rewards'][i])
        next_state.append(fav2['next_observations'][i])
        done.append(fav2['done'][i])
        expert.append(fav2['expert'][i])
    for i in range(len(state)):
        demo.append((state[i], action[i], reward[i], next_state[i], done[i], expert[i]))
    dq2.extend(demo)

    # converting PG dict to a deque class
    state1 = []
    next_state1 = []
    action1 = []
    reward1 = []
    done1 = []
    expert1 = []
    demo1 = []
    for i in range(len(fav3['observations'])):
        state1.append(fav3['observations'][i])
        action1.append(fav3['actions'][i])
        reward1.append(fav3['rewards'][i])
        next_state1.append(fav3['next_observations'][i])
        done1.append(fav3['done'][i])
        expert1.append(fav3['expert'][i])
    for i in range(len(state1)):
        demo1.append((state1[i], action1[i], reward1[i], next_state1[i], done1[i], expert1[i]))
    dq3.extend(demo1)

    # append all the experts data to demo_buffer of the agent
    agent.demo_buffer.extend(dq1)
    agent.demo_buffer.extend(dq2)
    agent.demo_buffer.extend(dq3)
    # shuffle the demo_buffer before pre-training
    random.shuffle(agent.demo_buffer)

    # use the demo data to pre-train network
    agent.pre_train()

    scores = []
    for e in range(Config.episode):
        done = False
        score = 0  # sum of reward in one episode
        state = env.reset()
        while done is False:
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            score += reward
            reward = reward if not done or score == 499 else -100
            agent.perceive(state, action, reward, next_state, done, 0.0)
            agent.train_Q_network(pre_train=False, update=False)
            state = next_state
            if done:
                scores.append(score)
                agent.sess.run(agent.update_target_net)
                print("episode:", e, " score:", score, " memory length:",
                      len(agent.replay_buffer), " epsilon:", agent.epsilon)
                # if np.mean(scores[-min(10, len(scores)):]) > 495:
                #     break
    return scores
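# The two dict-to-deque conversions above are identical except for the source
# dict; a small hypothetical helper like this would cover both fav2 and fav3,
# assuming the same keys ('observations', 'actions', 'rewards',
# 'next_observations', 'done', 'expert'):
from collections import deque

def dict_to_transitions(data):
    # zip the per-field lists back into (state, action, reward, next_state, done, expert) tuples
    return deque(
        zip(data['observations'], data['actions'], data['rewards'],
            data['next_observations'], data['done'], data['expert']))

# e.g. dq2 = dict_to_transitions(fav2); dq3 = dict_to_transitions(fav3)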