Example #1
import numpy as np


def test_agent():
    # Smoke test: build an Agent, fabricate one random transition,
    # and check that a single training step runs end to end.
    # (Agent and Train come from the project under test; their imports
    # are omitted in the original example.)
    state_space_dim = 3
    action_space_dim = 4
    train = Train()
    agent = Agent(state_space_dim=state_space_dim,
                  action_space_dim=action_space_dim,
                  low_action=-1,
                  high_action=1,
                  load=False)
    # single-transition batch: [None] adds the leading batch axis
    state = np.random.rand(state_space_dim)[None]
    next_state = np.random.rand(state_space_dim)[None]
    action = agent.get_action(state)
    reward = np.array([1])
    done = np.array([0])
    Q_loss, policy_loss = train(agent, state, next_state, action, reward, done)
    # The test passes as long as the training step runs and returns both losses.
    assert Q_loss is not None and policy_loss is not None
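
The Train object above is constructed with no arguments here and with discount and learning-rate keywords in Example #3, and is called with a batch of transitions, returning a critic loss and a policy loss. Its real implementation is not part of these examples; the following is a minimal DDPG-style sketch of such a callable. It assumes a TensorFlow agent whose actor and critic are Keras models, with target networks named target_actor and target_critic and a critic that takes [states, actions] as input (all of these names are assumptions, not taken from the examples).

import tensorflow as tf


class Train:
    # Hypothetical sketch of a DDPG-style update with the call signature
    # used in the examples; not the project's actual implementation.

    def __init__(self, discount_factor=0.99,
                 actor_learning_rate=1e-4, critic_learning_rate=1e-3):
        self.gamma = discount_factor
        self.actor_opt = tf.keras.optimizers.Adam(actor_learning_rate)
        self.critic_opt = tf.keras.optimizers.Adam(critic_learning_rate)

    def __call__(self, agent, states, next_states, actions, rewards, dones):
        states = tf.cast(states, tf.float32)
        next_states = tf.cast(next_states, tf.float32)
        actions = tf.cast(actions, tf.float32)
        rewards = tf.cast(rewards, tf.float32)
        dones = tf.cast(dones, tf.float32)

        # Critic update: regress Q(s, a) towards the one-step TD target
        # computed from the (assumed) target networks.
        with tf.GradientTape() as tape:
            target_actions = agent.target_actor(next_states)
            target_q = tf.squeeze(
                agent.target_critic([next_states, target_actions]), -1)
            td_target = rewards + self.gamma * (1.0 - dones) * target_q
            q = tf.squeeze(agent.critic([states, actions]), -1)
            q_loss = tf.reduce_mean(tf.square(td_target - q))
        critic_grads = tape.gradient(q_loss, agent.critic.trainable_variables)
        self.critic_opt.apply_gradients(
            zip(critic_grads, agent.critic.trainable_variables))

        # Actor update: maximise Q(s, actor(s)) by minimising its negation.
        with tf.GradientTape() as tape:
            policy_loss = -tf.reduce_mean(
                agent.critic([states, agent.actor(states)]))
        actor_grads = tape.gradient(policy_loss,
                                    agent.actor.trainable_variables)
        self.actor_opt.apply_gradients(
            zip(actor_grads, agent.actor.trainable_variables))

        return q_loss, policy_loss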
Example #2
def play(ctx, steps, noise):
    env, state_space_dim, action_space_dim, state_norm_array, min_action, \
        max_action = setup_env()

    # noise_process = OUNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA,
    #     theta=THETA,
    #     dt=1e-2)

    # noise_process = NormalNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)

    # noise_process = LinearSegmentNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)

    noise_process = SmoothNoiseND(steps=steps,
                                  dim=action_space_dim,
                                  sigma=SIGMA)

    agent = Agent(state_space_dim,
                  action_space_dim,
                  layer_dims=LAYERS_DIMS,
                  low_action=min_action,
                  high_action=max_action,
                  noise_process=noise_process,
                  load=True)

    state = env.reset()

    agent.actor.summary()
    agent.critic.summary()
    for _ in range(steps):
        action = agent.get_action(state[None], with_exploration=noise)[0]
        state, reward, done, _ = env.step(action)
        env.render()
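
Both play() and the training loop in Example #3 rely on setup_env(), which is not shown in these examples. Below is a plausible sketch under the assumption that the task is a continuous-control Gym environment; the environment name and the all-ones state_norm_array are assumptions, not taken from the examples.

import gym
import numpy as np


def setup_env(env_name='Pendulum-v1'):
    # Hypothetical helper: build the environment and expose the shapes and
    # action bounds that the Agent constructor above expects.
    env = gym.make(env_name)
    state_space_dim = env.observation_space.shape[0]
    action_space_dim = env.action_space.shape[0]
    state_norm_array = np.ones(state_space_dim, dtype='float32')  # no scaling
    min_action = float(env.action_space.low[0])
    max_action = float(env.action_space.high[0])
    return (env, state_space_dim, action_space_dim,
            state_norm_array, min_action, max_action)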
Example #3
def train(ctx, episodes, steps):
    logger = Logging([
        'episode', 'rewards', 'running_40_episode_reward', 'episode_length',
        'episode_run_time', 'average_step_run_time', 'q_loss', 'p_loss'
    ])

    env, state_space_dim, action_space_dim, state_norm_array, min_action, \
        max_action = setup_env()
    replay_buffer = ReplayBuffer(state_space_dim=state_space_dim,
                                 action_space_dim=action_space_dim,
                                 size=BUFFER_SIZE,
                                 sample_size=BATCH_SIZE)

    # noise_process = OUNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA,
    #     theta=THETA,
    #     dt=1e-2)

    # noise_process = NormalNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)

    # noise_process = LinearSegmentNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)

    noise_process = SmoothNoiseND(steps=steps,
                                  dim=action_space_dim,
                                  sigma=SIGMA)

    agent = Agent(state_space_dim,
                  action_space_dim,
                  layer_dims=LAYERS_DIMS,
                  low_action=min_action,
                  high_action=max_action,
                  noise_process=noise_process,
                  tau=TAU,
                  load=True)

    train = Train(discount_factor=DISCOUNT,
                  actor_learning_rate=ACTOR_LR,
                  critic_learning_rate=CRITIC_LR)

    training_rewards = []
    for episode in range(episodes):
        noise_process.reset()
        state = np.array(env.reset(), dtype='float32')
        episode_reward = 0
        step_count = 0
        done = False
        episode_start_time = time()
        step_times = []
        q_losses = []
        p_losses = []
        while not done:
            if step_count >= steps:
                break

            step_time_start = time()
            step_count += 1

            # environment step
            action = agent.get_action(state[None], with_exploration=True)[0]
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push((state, next_state, action, reward, done))
            state = next_state

            # training step: only once the buffer holds a full batch
            if replay_buffer.ready:
                states, next_states, actions, \
                    rewards, dones = replay_buffer.sample()
                q_loss, p_loss = \
                    train(agent, states, next_states,
                          actions, rewards, dones)
                agent.track_weights()
                q_losses.append(q_loss.numpy())
                p_losses.append(p_loss.numpy())

            episode_reward += reward
            step_time_end = time()
            step_times.append(step_time_end - step_time_start)
        training_rewards.append(episode_reward)
        episode_end_time = time()
        episode_time = episode_end_time - episode_start_time
        average_step_time = np.array(step_times).mean()
        average_q_loss = np.array(q_losses).mean()
        average_p_loss = np.array(p_losses).mean()
        running_40_episode_reward = np.mean(training_rewards[-40:])

        logger.log([
            episode, episode_reward, running_40_episode_reward, step_count,
            episode_time, average_step_time, average_q_loss, average_p_loss
        ])

        agent.save_models()
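
Example #3 touches ReplayBuffer only through push(), the ready flag, and sample(). A minimal ring-buffer sketch that satisfies that interface follows; the internal layout and the readiness rule (sample only after one full batch has been collected) are assumptions.

import numpy as np


class ReplayBuffer:
    # Hypothetical sketch matching the interface used above:
    # push((s, s', a, r, done)), .ready, and sample() -> five batched arrays.

    def __init__(self, state_space_dim, action_space_dim, size, sample_size):
        self.size = size
        self.sample_size = sample_size
        self.states = np.zeros((size, state_space_dim), dtype=np.float32)
        self.next_states = np.zeros((size, state_space_dim), dtype=np.float32)
        self.actions = np.zeros((size, action_space_dim), dtype=np.float32)
        self.rewards = np.zeros((size,), dtype=np.float32)
        self.dones = np.zeros((size,), dtype=np.float32)
        self.count = 0  # total transitions ever pushed
        self.ptr = 0    # next slot to overwrite (ring buffer)

    def push(self, transition):
        state, next_state, action, reward, done = transition
        self.states[self.ptr] = state
        self.next_states[self.ptr] = next_state
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.dones[self.ptr] = float(done)
        self.ptr = (self.ptr + 1) % self.size
        self.count += 1

    @property
    def ready(self):
        # Only sample once at least one full batch has been collected.
        return self.count >= self.sample_size

    def sample(self):
        high = min(self.count, self.size)
        idx = np.random.randint(0, high, size=self.sample_size)
        return (self.states[idx], self.next_states[idx], self.actions[idx],
                self.rewards[idx], self.dones[idx])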