Example #1
def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        scale_actions, epsilon_steps, epsilon_final, tau_actor, tau_critic,
        use_ornstein_noise, learning_rate_actor, learning_rate_critic,
        reward_scale, clip_grad, initialise_params, layers, save_dir, title):
    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if scale_actions:
        kickto_weights = np.array(
            [[-0.375, 0.5, 0, 0.0625, 0],
             [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0],
                                   [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])

    initial_weights = np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4, ))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]
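    # These hand-coded weights and biases reproduce the scripted kick-to and
    # shoot-goal policies; when initialise_params is set, they are loaded into
    # the actor's action-parameter passthrough layer further below.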

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)
    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    print(env.action_space)
    print(env.observation_space)
    env.seed(seed)
    np.random.seed(seed)
    agent = PADDPGAgent(
        observation_space=env.observation_space.spaces[0],
        action_space=env.action_space,
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,
        learning_rate_critic=learning_rate_critic,
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,
        clip_grad=clip_grad,
        tau_actor=tau_actor,
        tau_critic=tau_critic,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        n_step_returns=False,
        adam_betas=(0.9, 0.999),
        critic_kwargs={
            'hidden_layers': layers,
            'init_type': "kaiming"
        },
        actor_kwargs={
            'hidden_layers': layers,
            'init_type': "kaiming",  # 'init_std': 1e-5,  # 0.0001,
            'squashing_function': False
        },
        seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)
    print(agent)
    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()

    log_f = open("log_paddpg_GoalEnv.txt", "w+")

    for i in range(episodes):
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_actions, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)
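        # pad_action is assumed to combine the discrete action index with its
        # parameters into the composite action the flattened-action wrapper
        # expects (a sketch of it follows this example).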

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)

            r = reward * reward_scale
            agent.step(state,
                       (act, act_param, all_actions, all_action_parameters),
                       r,
                       next_state, (next_act, next_act_param, next_all_actions,
                                    next_all_action_parameters),
                       terminal,
                       optimise=True)

            act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters
            action = next_action
            state = next_state
            episode_reward += reward

            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                (np.array(returns) == 50.).sum() / len(returns)))

            # Log columns, left to right: episode index, episode reward, mean
            # reward over all episodes so far, mean return over the last 100
            # episodes, and the overall success rate (return == 50).
            log_f.write('{},{},{},{},{}\n'.format(
                i, episode_reward, total_reward / (i + 1),
                np.array(returns[-100:]).mean(),
                (np.array(returns) == 50.).sum() / len(returns)))

            log_f.flush()

    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    env.close()
    print(agent)

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =",
              sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)
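The examples in this listing call a pad_action helper that is not shown. Below is a minimal sketch, assuming the Goal domain's three parameterised actions (kick-to with two parameters, shoot-goal-left and shoot-goal-right with one each); the exact tuple layout returned is an assumption and must match what GoalFlattenedActionWrapper expects.

import numpy as np

def pad_action(act, act_param):
    # Fill only the selected action's parameter slot; the other slots stay zero.
    params = [np.zeros((2,), dtype=np.float32),
              np.zeros((1,), dtype=np.float32),
              np.zeros((1,), dtype=np.float32)]
    params[act][:] = act_param
    return (act, params)  # adjust the layout to whatever the wrapper expects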
Example #2
def run(seed, episodes, evaluation_episodes, batch_size, gamma,
        inverting_gradients, initial_memory_threshold, replay_memory_size,
        epsilon_steps, epsilon_final, tau_actor, tau_actor_param,
        tau_actor_param_critic, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, learning_rate_actor_param_critic,
        reward_scale, clip_grad, title, scale_actions, zero_index_gradients,
        split, layers, multipass, indexed, weighted, average, random_weighted,
        render_freq, action_input_layer, initialise_params, save_freq,
        save_dir, save_frames, visualise):

    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    if scale_actions:
        kickto_weights = np.array(
            [[-0.375, 0.5, 0, 0.0625, 0],
             [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0],
                                   [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])

    initial_weights = np.zeros((4, 20))  #np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4, ))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]

    if not scale_actions:
        # rescale initial action-parameters for a scaled state space
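        # Assuming ScaledStateWrapper maps a state s to s' = (s - mid) / (high - mid)
        # (i.e. into [-1, 1]), the hand-coded policy w.s + b is equivalent to
        # (w * (high - mid)).s' + (b + w.mid), which the two updates below compute.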
        for a in range(env.action_space.spaces[0].n):
            mid = (env.observation_space.spaces[0].high +
                   env.observation_space.spaces[0].low) / 2.
            initial_bias[a] += np.sum(initial_weights[a] * mid)
            initial_weights[a] = (initial_weights[a] *
                                  env.observation_space.spaces[0].high -
                                  initial_weights[a] * mid)

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)
    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    assert not (split and multipass)
    agent_class = HHQNAgent
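    # split and multipass are asserted for interface compatibility but are
    # otherwise unused: the agent class is fixed to HHQNAgent in this example.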

    agent = agent_class(
        observation_space=env.observation_space.spaces[0],
        action_space=env.action_space,
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,  # 0.0001
        learning_rate_actor_param=learning_rate_actor_param,  # 0.001
        learning_rate_actor_param_critic=learning_rate_actor_param_critic,
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,
        clip_grad=clip_grad,
        indexed=indexed,
        average=average,
        random_weighted=random_weighted,
        tau_actor=tau_actor,
        weighted=weighted,
        tau_actor_param=tau_actor_param,
        tau_actor_param_critic=tau_actor_param_critic,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        actor_kwargs={
            'hidden_layers': layers,
            'output_layer_init_std': 1e-5,
            'action_input_layer': action_input_layer,
        },
        actor_param_kwargs={
            'hidden_layers': layers,
            'output_layer_init_std': 1e-5,
            'squashing_function': False
        },
        zero_index_gradients=zero_index_gradients,
        seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights,
                                                       initial_bias)
    print(agent)
    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    avg_returns = []
    success_rates = []
    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))

        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)
        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        if visualise and i % render_freq == 0:
            env.render()

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)
            r = reward * reward_scale
            agent.step(state, (act, all_action_parameters), r, next_state,
                       (next_act, next_all_action_parameters), terminal, steps)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state
            episode_reward += reward

            if visualise and i % render_freq == 0:
                env.render()

            if terminal:
                break
        agent.end_episode()

        if save_frames:
            video_index = env.unwrapped.save_render_states(
                vidir, title, video_index)

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                (np.array(returns) == 50.).sum() / len(returns)))
        avg_returns.append(total_reward / (i + 1))
        success_rates.append((np.array(returns) == 50.).sum() / len(returns))

    plot_reward(avg_returns)
    plot_p(success_rates)
    end_time = time.time()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()

    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =",
              sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)
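The evaluate helper used at the end of each example is not shown either. Below is a minimal greedy-rollout sketch, assuming the same (state, steps) observation tuple and the pad_action convention sketched above; the signature and the step cap are assumptions.

import numpy as np

def evaluate(env, agent, episodes=1000, max_steps=150):
    # Roll out the agent (exploration is disabled by the caller) and collect
    # the undiscounted return of every episode.
    returns = []
    for _ in range(episodes):
        state, _ = env.reset()
        total = 0.
        for _ in range(max_steps):
            state = np.array(state, dtype=np.float32)
            act, act_param, *_ = agent.act(state)
            (state, _), reward, terminal, _ = env.step(pad_action(act, act_param))
            total += reward
            if terminal:
                break
        returns.append(total)
    return np.array(returns)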
Example #3
def run(seed, episodes, evaluation_episodes, scale, initialise_params,
        save_dir, title):
    alpha_param = 0.1

    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    # Unscaled defaults, assumed here so the snippet is self-contained (the
    # original module defines these before the scale branch). The weights
    # mirror the unscaled hand-coded policies from the examples above; the
    # exploration variances are placeholder values.
    xfear = 50.0 / PITCH_LENGTH
    yfear = 50.0 / PITCH_WIDTH
    caution = 5.0 / PITCH_WIDTH
    variances = [0.01, 0.01, 0.01]
    initial_parameter_weights = [
        np.array([[2.5, 1, 0, xfear, 0],
                  [0, 0, 1 - caution, 0, yfear]]),
        np.array([GOAL_WIDTH / 2 - 1, 0]),
        np.array([-GOAL_WIDTH / 2 + 1, 0]),
    ]
    if scale:
        variances[0] = 0.0001
        variances[1] = 0.0001
        variances[2] = 0.0001
        alpha_param = 0.06
        initial_parameter_weights[0] = np.array(
            [[-0.375, 0.5, 0, 0.0625, 0],
             [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        initial_parameter_weights[1] = np.array([0.857346647646219686, 0])
        initial_parameter_weights[2] = np.array([-0.857346647646219686, 0])
        env = ScaledStateWrapper(env)
        env = QPAMDPScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    action_obs_index = np.arange(14)
    param_obs_index = np.array([
        np.array([10, 11, 14, 15]),  # ball_features
        np.array([16]),  # keeper_features
        np.array([16]),  # keeper_features
    ])
    basis = CustomFourierBasis(14, env.observation_space.spaces[0].low[:14],
                               env.observation_space.spaces[0].high[:14])
    discrete_agent = SarsaLambdaAgent(env.observation_space.spaces[0],
                                      env.action_space.spaces[0],
                                      basis=basis,
                                      seed=seed,
                                      alpha=0.01,
                                      lmbda=0.1,
                                      gamma=0.9,
                                      temperature=1.0,
                                      cooling=1.0,
                                      scale_alpha=False,
                                      use_softmax=True,
                                      observation_index=action_obs_index,
                                      gamma_step_adjust=False)
    agent = QPAMDPAgent(
        env.observation_space.spaces[0],
        env.action_space,
        alpha=alpha_param,
        initial_action_learning_episodes=4000,
        seed=seed,
        action_obs_index=action_obs_index,
        parameter_obs_index=param_obs_index,
        variances=variances,
        discrete_agent=discrete_agent,
        action_relearn_episodes=2000,
        parameter_updates=1000,
        parameter_rollouts=50,
        norm_grad=True,
        print_freq=100,
        phi0_func=lambda state: np.array([1, state[1], state[1]**2]),
        phi0_size=3)
    # Alternating learning periods from the original paper:
    #   QPAMDP(1):        init(2000), parameter_updates(50),   relearn(50)
    #   QPAMDP(infinity): init(2000), parameter_updates(1000), relearn(2000)
    # The initial action-learning episodes were increased to 4000 here.

    if initialise_params:
        for a in range(3):
            agent.parameter_weights[a] = initial_parameter_weights[a]

    max_steps = 150
    start_time = time.time()
    agent.learn(env, episodes, max_steps)
    end_time = time.time()

    agent.plot_reward()
    agent.plot_p()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()

    returns = np.array(env.get_episode_rewards())
    print("Saving training results to:",
          os.path.join(dir, "QPAMDP{}".format(str(seed))))
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    print('Total P(S):{0:.4f}'.format((returns == 50.).sum() / len(returns)))
    print('Ave. last 100 episode P(S):{0:.4f}'.format(
        (returns[-100:] == 50.).sum() / 100.))

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.variances = 0
        agent.discrete_agent.epsilon = 0.
        agent.discrete_agent.temperature = 0.
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =",
              sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_returns)
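A hypothetical entry point for this last example; the seed, episode counts, output directory and title below are placeholders rather than values from the original experiments.

if __name__ == '__main__':
    run(seed=0,
        episodes=20000,
        evaluation_episodes=1000,
        scale=True,
        initialise_params=True,
        save_dir="results/goal",  # placeholder output directory
        title="QPAMDP")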