def test_render():
    """Smoke-test the environment's rendering.

    Renders the initial board, advances one step with both players
    choosing the Stick (no-move) action, and renders the result.
    Output-only: nothing is asserted or returned.
    """
    env = SoccerEnv()
    env.render()
    # Both players stand still for a single joint step.
    joint_action = env.encode_action(SoccerEnv.Action.Stick,
                                     SoccerEnv.Action.Stick)
    env.step(joint_action)
    env.render()
ref_P1_action = int(SoccerEnv.Action.S) ref_P2_action = int(SoccerEnv.Action.Stick) # Q errors for plotting Foe_Q_P1_Q_errors = [] for i_episode in range(n_episodes_MAX): state = env.reset() P1_Q_ref = Foe_Q_agent.Q[ref_state, ref_P1_action, ref_P2_action] for t in range(steps_MAX): joint_action = np.random.randint(num_actions) # Take action A, observe R, S' state_new, reward, done, info = env.step(joint_action) # Update Q P1_action, P2_action = env.decode_action(joint_action) P1_reward, P2_reward = env.decode_reward(state, reward) Foe_Q_agent.learn(P1_reward, state, state_new, P1_action, P2_action) state = state_new if done: # if verbose: # print("Episode finished after {} timesteps".format(t + 1)) break # calc error at end of episode update Foe_Q_P1_Q_errors.append(