def test_agent():
    # Smoke test: a single training step on one random transition should run
    # end-to-end without raising.
    state_space_dim = 3
    action_space_dim = 4
    train = Train()
    agent = Agent(state_space_dim=state_space_dim,
                  action_space_dim=action_space_dim,
                  low_action=-1,
                  high_action=1,
                  load=False)
    state = np.random.rand(state_space_dim)[None]
    next_state = np.random.rand(state_space_dim)[None]
    action = agent.get_action(state)
    reward = np.array([1])
    done = np.array([0])
    Q_loss, policy_loss = train(agent, state, next_state, action, reward, done)
    assert Q_loss is not None and policy_loss is not None
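# Illustrative only: the batch-of-one shape conventions the smoke test above
# relies on, inferred from the `[None]` indexing (an assumption, not a
# documented API of Agent/Train).
import numpy as np

_state_dim = 3
_state = np.random.rand(_state_dim)[None]   # shape (1, state_dim): a batch of one state
_reward = np.array([1])                     # shape (1,): one scalar reward
_done = np.array([0])                       # shape (1,): 0 means the episode is not finished
assert _state.shape == (1, _state_dim)
assert _reward.shape == (1,) and _done.shape == (1,)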
def play(ctx, steps, noise):
    # Roll out a trained agent for `steps` environment steps, optionally with
    # exploration noise, rendering each frame.
    env, state_space_dim, action_space_dim, state_norm_array, min_action, \
        max_action = setup_env()

    # Alternative noise processes kept for reference:
    # noise_process = OUNoise(dim=action_space_dim, sigma=SIGMA, theta=THETA, dt=1e-2)
    # noise_process = NormalNoise(dim=action_space_dim, sigma=SIGMA)
    # noise_process = LinearSegmentNoise(dim=action_space_dim, sigma=SIGMA)
    noise_process = SmoothNoiseND(steps=steps, dim=action_space_dim, sigma=SIGMA)

    agent = Agent(state_space_dim,
                  action_space_dim,
                  layer_dims=LAYERS_DIMS,
                  low_action=min_action,
                  high_action=max_action,
                  noise_process=noise_process,
                  load=True)

    state = env.reset()
    agent.actor.summary()
    agent.critic.summary()

    for _ in range(steps):
        action = agent.get_action(state[None], with_exploration=noise)[0]
        state, reward, done, _ = env.step(action)
        env.render()
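# A minimal sketch of one way to generate temporally smooth exploration noise,
# assuming an interface similar to the process used above (per-step samples over
# a fixed horizon, with reset()). This is NOT the repo's SmoothNoiseND; the class
# name, the moving-average smoothing, and the sample() method are assumptions.
import numpy as np

class SimpleSmoothNoise:
    def __init__(self, steps, dim, sigma, window=16):
        raw = np.random.randn(steps + window, dim) * sigma
        kernel = np.ones(window) / window
        # Low-pass filter each action dimension with a moving average so that
        # consecutive samples are correlated rather than independent.
        smoothed = [np.convolve(raw[:, d], kernel, mode='valid') for d in range(dim)]
        self.noise = np.stack(smoothed, axis=-1)[:steps]
        self.t = 0

    def reset(self):
        self.t = 0

    def sample(self):
        n = self.noise[self.t]
        self.t = min(self.t + 1, len(self.noise) - 1)
        return n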
def train(ctx, episodes, steps):
    logger = Logging([
        'episode',
        'rewards',
        'running_40_episode_reward',
        'episode_length',
        'episode_run_time',
        'average_step_run_time',
        'q_loss',
        'p_loss'
    ])

    env, state_space_dim, action_space_dim, state_norm_array, min_action, \
        max_action = setup_env()

    replay_buffer = ReplayBuffer(state_space_dim=state_space_dim,
                                 action_space_dim=action_space_dim,
                                 size=BUFFER_SIZE,
                                 sample_size=BATCH_SIZE)

    # Alternative noise processes kept for reference:
    # noise_process = OUNoise(dim=action_space_dim, sigma=SIGMA, theta=THETA, dt=1e-2)
    # noise_process = NormalNoise(dim=action_space_dim, sigma=SIGMA)
    # noise_process = LinearSegmentNoise(dim=action_space_dim, sigma=SIGMA)
    noise_process = SmoothNoiseND(steps=steps, dim=action_space_dim, sigma=SIGMA)

    agent = Agent(state_space_dim,
                  action_space_dim,
                  layer_dims=LAYERS_DIMS,
                  low_action=min_action,
                  high_action=max_action,
                  noise_process=noise_process,
                  tau=TAU,
                  load=True)

    train = Train(discount_factor=DISCOUNT,
                  actor_learning_rate=ACTOR_LR,
                  critic_learning_rate=CRITIC_LR)

    training_rewards = []
    for episode in range(episodes):
        noise_process.reset()
        state = np.array(env.reset(), dtype='float32')
        episode_reward = 0
        step_count = 0
        done = False
        episode_start_time = time()
        step_times = []
        q_losses = []
        p_losses = []

        while not done:
            if step_count >= steps:
                break
            step_time_start = time()
            step_count += 1

            # environment step
            action = agent.get_action(state[None], with_exploration=True)[0]
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push((state, next_state, action, reward, done))
            state = next_state

            # training step (only once the buffer holds a full batch)
            if replay_buffer.ready:
                states, next_states, actions, rewards, dones = replay_buffer.sample()
                q_loss, p_loss = train(agent, states, next_states, actions,
                                       rewards, dones)
                agent.track_weights()
                q_losses.append(q_loss.numpy())
                p_losses.append(p_loss.numpy())

            episode_reward += reward
            step_time_end = time()
            step_times.append(step_time_end - step_time_start)

        training_rewards.append(episode_reward)
        episode_end_time = time()
        episode_time = episode_end_time - episode_start_time
        average_step_time = np.array(step_times).mean()
        average_q_loss = np.array(q_losses).mean()
        average_p_loss = np.array(p_losses).mean()
        running_40_episode_reward = np.mean(training_rewards[-40:])

        logger.log([
            episode,
            episode_reward,
            running_40_episode_reward,
            step_count,
            episode_time,
            average_step_time,
            average_q_loss,
            average_p_loss
        ])

        agent.save_models()
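# A minimal numpy sketch of the losses a DDPG-style Train step is assumed to
# compute for the loop above (standard DDPG; not taken from this repo's Train
# implementation). `critic`, `target_critic`, `actor`, `target_actor` are
# placeholder callables, and `soft_update` mirrors the TAU passed to the Agent.
import numpy as np

def ddpg_losses(critic, target_critic, actor, target_actor,
                states, next_states, actions, rewards, dones, discount=0.99):
    # Critic target: y = r + gamma * (1 - done) * Q'(s', mu'(s'))
    next_actions = target_actor(next_states)
    targets = rewards + discount * (1.0 - dones) * target_critic(next_states, next_actions)
    # Critic loss: mean squared Bellman error.
    q_loss = np.mean((critic(states, actions) - targets) ** 2)
    # Actor loss: minimise -Q(s, mu(s)), i.e. ascend the critic's value estimate.
    p_loss = -np.mean(critic(states, actor(states)))
    return q_loss, p_loss

def soft_update(target_weights, online_weights, tau):
    # Polyak averaging: theta_target <- tau * theta_online + (1 - tau) * theta_target
    return [tau * w + (1.0 - tau) * tw
            for w, tw in zip(online_weights, target_weights)]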