# Actor-Critic training loop (TF1 style). Fragment: assumes `sess`, `actor`, `env`,
# RENDER, MAX_EPISODE and MAX_EP_STEP are defined earlier in the script.
import tensorflow as tf

critic = Critic(sess, n_features=env.observation_space.shape[0], learning_rate=0.01)
sess.run(tf.global_variables_initializer())
tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    observation = env.reset()
    t = 0
    track_reward = []
    while True:
        if RENDER:
            env.render()
        action = actor.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        if done:
            reward = -20  # penalize termination (e.g. the pole falling)
        track_reward.append(reward)

        # The critic evaluates the transition; its TD error drives the actor update.
        td_error = critic.learn(observation, reward, observation_)
        actor.learn(observation, action, td_error)

        observation = observation_
        t += 1
        if done or t > MAX_EP_STEP:
            ep_rs_sum = sum(track_reward)
            break  # end of episode (the original fragment is truncated here)
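# The loop above expects critic.learn(s, r, s_) to take one TD(0) step and return
# the TD error. A minimal sketch of such a Critic, assuming a TF1 value network
# with one hidden layer; GAMMA, the layer size, and the class body are illustrative
# assumptions, not the original implementation.
import numpy as np
import tensorflow as tf

GAMMA = 0.9  # assumed discount factor

class Critic:
    def __init__(self, sess, n_features, learning_rate=0.01):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, "reward")
        hidden = tf.layers.dense(self.s, 20, tf.nn.relu)
        self.v = tf.layers.dense(hidden, 1)                 # state value V(s)
        self.td_error = self.r + GAMMA * self.v_ - self.v   # TD target minus estimate
        loss = tf.square(self.td_error)
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_ = self.sess.run(self.v, {self.s: s_})            # bootstrap V(s')
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error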
# Second variant: the same TF1 Actor-Critic loop against a custom environment whose
# step() returns a 3-tuple (s_, r, done) and which maps discrete action indices to
# raw actions via env.t_action. Fragment: assumes State_dim, Actor_lr, Critic_lr,
# env and MAX_EPISODE are defined earlier.
import tensorflow as tf

Action_dim = 4
sess = tf.Session()
actor = Actor(sess, State_dim=State_dim, Action_dim=Action_dim, lr=Actor_lr)
critic = Critic(sess, State_dim=State_dim, lr=Critic_lr)
sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = 0
    total_action = []
    done = False
    while not done and t < 200:
        a = actor.choose_action(s)
        s_, r, done = env.step(env.t_action[a])
        total_action.append(env.t_action[a])
        if done:
            r = -200  # large penalty on termination
        td_error = critic.learn(s, -r, s_)  # note: this variant feeds the negated reward
        actor.learn(s, a, td_error)
        s = s_
        track_r += r
        t += 1
    print("episode:", i_episode, " tracked actions to attempt goal:", total_action)
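# Both loops above rely on an Actor exposing choose_action/learn. A minimal sketch
# of that interface, assuming a TF1 softmax policy over discrete actions; the
# network shape and names are illustrative assumptions, not the original code.
import numpy as np
import tensorflow as tf

class Actor:
    def __init__(self, sess, State_dim, Action_dim, lr=0.001):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, State_dim], "state")
        self.a = tf.placeholder(tf.int32, None, "action")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")
        hidden = tf.layers.dense(self.s, 20, tf.nn.relu)
        self.acts_prob = tf.layers.dense(hidden, Action_dim, tf.nn.softmax)
        log_prob = tf.log(self.acts_prob[0, self.a])
        # Gradient ascent on log pi(a|s) * td_error (TD error as advantage estimate).
        self.train_op = tf.train.AdamOptimizer(lr).minimize(-log_prob * self.td_error)

    def choose_action(self, s):
        probs = self.sess.run(self.acts_prob, {self.s: s[np.newaxis, :]})
        return np.random.choice(len(probs.ravel()), p=probs.ravel())

    def learn(self, s, a, td_error):
        self.sess.run(self.train_op,
                      {self.s: s[np.newaxis, :], self.a: a, self.td_error: td_error})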
class ReinforcementLearner:
    """
    Reinforcement Learner agent using the Actor-Critic architecture.

    Methods
    -------
    run():
        Runs all episodes with pivotal parameters
    """

    def __init__(self):
        self.__actor = Actor(
            parameters.ACTOR_LEARNING_RATE,
            parameters.ACTOR_DISCOUNT_FACTOR,
            parameters.ACTOR_TRACE_DECAY,
            parameters.ACTOR_EPSILON,
            parameters.ACTOR_EPSILON_DECAY,
        )
        self.__critic = CriticFactory.get_critic(
            parameters.USE_TABLE_CRITIC,
            parameters.CRITIC_LEARNING_RATE,
            parameters.CRITIC_DISCOUNT_FACTOR,
            parameters.CRITIC_TRACE_DECAY,
            parameters.CRITIC_NN_DIMENSIONS,
        )
        self.__simulated_world = SimulatedWorld()
        self.__episodes = parameters.EPISODES

    def __run_one_episode(self, visualize: bool = False) -> None:
        self.__actor.reset_eligibilities()
        self.__critic.reset_eligibilities()

        state, possible_actions = self.__simulated_world.reset()
        action = self.__actor.choose_action(state, possible_actions)

        done = False
        while not done:
            next_state, reward, done, possible_actions = \
                self.__simulated_world.step(action, visualize)
            next_action = self.__actor.choose_action(next_state, possible_actions)

            self.__actor.replace_eligibilities(state, action)
            self.__critic.replace_eligibilities(state)

            td_error = self.__critic.td_error(reward, next_state, state)
            self.__critic.update(reward, next_state, state)
            self.__actor.update(td_error)

            state, action = next_state, next_action

    def run(self) -> None:
        """
        Runs all episodes with pivotal parameters.
        Visualizes one round at the end.
        """
        for episode in range(self.__episodes):
            print('Episode:', episode + 1)
            self.__run_one_episode()

        print('Training completed.')
        self.__actor.plot_training_data()
        self.__critic.plot_training_data()
        self.__simulated_world.plot_training_data()

        if parameters.VISUALIZE_GAMES:
            print('Showing one episode with the greedy strategy.')
            self.__actor.set_epsilon(0)
            self.__run_one_episode(True)
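# The class above fetches its critic from CriticFactory.get_critic(USE_TABLE_CRITIC, ...).
# A plausible sketch of the table-based variant under my own assumptions (hashable
# states, TD(lambda) with replacing eligibility traces); the class and attribute
# names are illustrative, not the repo's.
from collections import defaultdict

class TableCritic:
    def __init__(self, learning_rate, discount_factor, trace_decay):
        self.lr = learning_rate
        self.gamma = discount_factor
        self.decay = trace_decay
        self.values = defaultdict(float)         # V(s), defaults to 0
        self.eligibilities = defaultdict(float)  # e(s)

    def reset_eligibilities(self):
        self.eligibilities.clear()

    def replace_eligibilities(self, state):
        self.eligibilities[state] = 1.0          # replacing traces

    def td_error(self, reward, next_state, state):
        return reward + self.gamma * self.values[next_state] - self.values[state]

    def update(self, reward, next_state, state):
        # Distribute the TD error over all recently visited states, then decay traces.
        delta = self.td_error(reward, next_state, state)
        for s in list(self.eligibilities):
            self.values[s] += self.lr * delta * self.eligibilities[s]
            self.eligibilities[s] *= self.gamma * self.decay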
# Actor-Critic training for traffic-signal control (TF1 style). Fragment: assumes
# DIC_ENVS, Config, DQfDConfig, Actor, Critic, convert_to_input and write_summary
# are provided by the surrounding project.
import itertools
import os
import pickle
import random
from collections import deque

import numpy as np
import tensorflow as tf


def _train(dic_exp_conf, dic_agent_conf, dic_traffic_env_conf, dic_path):
    # Seed every source of randomness for reproducibility.
    random.seed(dic_agent_conf['SEED'])
    np.random.seed(dic_agent_conf['SEED'])
    tf.set_random_seed(dic_agent_conf['SEED'])

    dic_path["PATH_TO_LOG"] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'], "train_round")
    if not os.path.exists(dic_path['PATH_TO_LOG']):
        os.makedirs(dic_path['PATH_TO_LOG'])

    # Build the simulator environment for the configured backend.
    if dic_traffic_env_conf['SIMULATOR_TYPE'] == 'sumo':
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=dic_path["PATH_TO_SUMO_CONF"],
            dic_traffic_env_conf=dic_traffic_env_conf)
    elif dic_traffic_env_conf['SIMULATOR_TYPE'] == 'anon':
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=dic_path["PATH_TO_DATA"],
            dic_traffic_env_conf=dic_traffic_env_conf)

    dic_agent_conf["PHI"] = 5
    dic_agent_conf["MIN_GREEN_VEC"] = 3
    dic_agent_conf["MAX_RED_VEC"] = 6

    # Load demonstration transitions for the DQfD-style critic.
    demo_path = "../frap/demo_{}.p".format(dic_exp_conf["TRAFFIC_FILE"])
    with open(demo_path, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(
            itertools.islice(demo_transitions, 0, Config.demo_buffer_size))

    sess = tf.InteractiveSession()
    actor = Actor(sess=sess, n_features=16, n_actions=8,
                  dic_traffic_env_conf=dic_traffic_env_conf, lr=1e-3)  # initialize the Actor
    critic = Critic(sess=sess, n_features=16, config=DQfDConfig(),
                    dic_traffic_env_conf=dic_traffic_env_conf,
                    demo=demo_transitions, lr=1e-3)  # initialize the Critic
    sess.run(tf.global_variables_initializer())  # initialize the network parameters

    # Pre-train: fit the critic's Q-network on demonstrations, then the actor on its output.
    for i in range(10):
        state, action = critic.train_Q_network(pre_train=True)
        actor.pretrain(state, action)

    for i in range(501):
        done = False
        state = env.reset()
        step_num = 0
        while not done and step_num < int(dic_exp_conf["EPISODE_LEN"] /
                                          dic_traffic_env_conf["MIN_ACTION_TIME"]):
            # One action per intersection state.
            action_list = []
            for one_state in state:
                s = convert_to_input(one_state, dic_traffic_env_conf)
                action, probs = actor.choose_action(s)
                action_list.append(action)
            next_state, reward, done, ave_reward = env.step(action_list)

            s = convert_to_input(state[0], dic_traffic_env_conf)
            s_ = convert_to_input(next_state[0], dic_traffic_env_conf)
            next_action, _ = actor.choose_action(s_)
            if i != 0:  # skip learning in the very first episode
                q_a = critic.learn(s, np.array(ave_reward), s_,
                                   np.array([action]), np.array([next_action]), probs)
                actor.learn(s, np.array([action]), q_a)
            state = next_state
            step_num += 1
        env.bulk_log(i)
        write_summary(dic_path, dic_exp_conf, i)
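# For context, a hypothetical invocation of _train. Only the keys the function body
# reads directly are shown, and every value (traffic file name, paths, episode
# length) is a made-up example, not the project's real configuration; the env,
# Actor and Critic constructors likely consume further keys not listed here.
dic_exp_conf = {"TRAFFIC_FILE": "cross.2phases", "EPISODE_LEN": 3600}
dic_agent_conf = {"SEED": 11}  # PHI / MIN_GREEN_VEC / MAX_RED_VEC are set inside _train
dic_traffic_env_conf = {"SIMULATOR_TYPE": "anon", "MIN_ACTION_TIME": 10}
dic_path = {"PATH_TO_WORK_DIRECTORY": "records/demo", "PATH_TO_DATA": "data/demo"}

_train(dic_exp_conf, dic_agent_conf, dic_traffic_env_conf, dic_path)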