RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)
        if done:
            ep_rs_sum = sum(RL.ep_rs)
            if "running_reward" not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = False  # set to True to enable rendering
            print("episode:", i_episode, " reward:", int(running_reward))
            RL.learn()  # train on the finished episode, then clear its memory
            break
        observation = observation_
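# The training loops in this section all assume a PolicyGradient agent that
# exposes choose_action(), store_transition(), learn(), and a per-episode
# reward list ep_rs. A minimal sketch of that interface, written here as a
# plain-numpy linear-softmax REINFORCE agent; this is an illustrative
# assumption, not the actual network used by these scripts.
import numpy as np

class PolicyGradient:
    def __init__(self, n_actions, n_features, learning_rate=0.02, reward_decay=0.99):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # episode memory
        self.W = np.zeros((n_features, n_actions))        # linear softmax policy

    def _softmax(self, logits):
        z = np.exp(logits - logits.max())
        return z / z.sum()

    def choose_action(self, observation):
        probs = self._softmax(np.asarray(observation) @ self.W)
        return np.random.choice(self.n_actions, p=probs)

    def store_transition(self, s, a, r):
        self.ep_obs.append(np.asarray(s))
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def _discount_and_norm_rewards(self):
        # G_t = r_t + gamma * G_{t+1}, then normalized to zero mean / unit std.
        returns = np.zeros(len(self.ep_rs))
        running = 0.0
        for t in reversed(range(len(self.ep_rs))):
            running = self.ep_rs[t] + self.gamma * running
            returns[t] = running
        returns -= returns.mean()
        std = returns.std()
        return returns / std if std > 0 else returns

    def learn(self):
        returns = self._discount_and_norm_rewards()
        for s, a, g in zip(self.ep_obs, self.ep_as, returns):
            probs = self._softmax(s @ self.W)
            grad_logits = -probs
            grad_logits[a] += 1.0  # d log pi(a|s) / d logits
            self.W += self.lr * g * np.outer(s, grad_logits)  # gradient ascent
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # clear episode memory
        return returns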
RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)  # reward = -1 in all cases
        RL.store_transition(observation, action, reward)
        if done:
            # calculate the running (exponentially smoothed) reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, " reward:", int(running_reward))
            RL.learn()  # train on the finished episode, then clear its memory
            break
        observation = observation_
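# The loops above reference env, RENDER, and DISPLAY_REWARD_THRESHOLD without
# defining them. A minimal setup sketch, assuming the classic-control
# CartPole-v0 task under the old 4-tuple Gym step API that these loops use;
# the actual environment is not named in this section.
import gym

DISPLAY_REWARD_THRESHOLD = 400  # start rendering once the smoothed reward passes this
RENDER = False                  # rendering slows training considerably

env = gym.make('CartPole-v0')
env.seed(1)        # reproducible episodes
env = env.unwrapped
print(env.action_space.n, env.observation_space.shape[0])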
print("n_features=", RL.n_features) i_episode = 0 # for i_episode in range(60000): while True: i_episode += 1 state = env.reset() done = False user = 0 reward1 = reward2 = 0 while not done: if user == 0: action1 = RL.choose_action(state) state1, reward1, done, infos = env.step(action1, -1) if done: RL.store_transition(state, action1, reward1) state = state1 reward1 = reward2 = 0 elif user == 1: while True: random_act = env.action_space.sample() x = random_act % 3 y = random_act // 3 found = False for i in range(0, 27, 3): chunk = state1[i : i + 3] # print("chunk=",chunk) if ([x,y,1] == chunk).all():
for i_episode in range(600):
    # First half of training runs on relay_net_slow, second half on relay_net_Reduce.
    if i_episode < 300:
        env = relay_net_slow
        # env, dummy = create_example_env()
    else:
        env = relay_net_Reduce
    state, reward = env.update_state()
    state = np.array(state)
    for i in range(steps_per_episode):
        action_id = RL.choose_action(state)
        action_space = env.get_possible_action_space()
        action = action_space[action_id]
        if random_action:
            action = action_space[np.random.randint(len(action_space))]
        env.apply_action(action)
        next_state, reward = env.update_state()
        next_state = np.array(next_state)
        reward = postprocessreward(reward, th)
        RL.store_transition(state, action_id, reward)
        state = next_state
        if reward > 1:
            break  # assumption: end the episode early once the shaped reward exceeds 1
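# postprocessreward() and th are used above but not defined in this section.
# A purely hypothetical sketch of such reward shaping, chosen so that
# "reward > 1" can only fire when the raw reward clears the threshold th;
# the real function may differ.
def postprocessreward(reward, th):
    if reward >= th:
        return 2.0                # threshold crossed: triggers the early break
    return max(0.0, reward / th)  # otherwise squash into [0, 1)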
RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(3000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        # The agent explores according to its policy pi until the episode ends;
        # every <observation, action, reward> tuple collected along the way is
        # stored in the episode memory and later used for training.
        action = RL.choose_action(observation)
        # This is why policy gradient is so time-consuming: most of the time
        # goes into interacting with the environment.
        observation_, reward, done, info = env.step(action)
        # Every exploration step is appended to the episode memory ("replay").
        RL.store_transition(observation, action, reward)
        if done:
            ep_rs_sum = sum(RL.ep_rs)  # sum of all rewards in this episode
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            RL.learn()  # train on the finished episode, then clear its memory
            break
        observation = observation_
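# reward_decay is the discount factor gamma that learn() applies when it
# turns the stored ep_rs into per-step returns. A small standalone
# illustration of the usual recursion G_t = r_t + gamma * G_{t+1}:
import numpy as np

def discounted_returns(rewards, gamma):
    G = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        G[t] = running
    return G

print(discounted_returns([1.0, 1.0, 1.0], gamma=0.99))  # ~[2.9701, 1.99, 1.0]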