def train(params, model_name, save_interval=1000, eval_interval=200, record_episodes=True, restart=False):
    try:
        # Create test env
        print("[INFO] Creating test environment")
        test_env = gym.make(env_name)

        # Training parameters
        initial_lr      = params["initial_lr"]
        discount_factor = params["discount_factor"]
        gae_lambda      = params["gae_lambda"]
        ppo_epsilon     = params["ppo_epsilon"]
        value_scale     = params["value_scale"]
        entropy_scale   = params["entropy_scale"]
        horizon         = params["horizon"]
        num_epochs      = params["num_epochs"]
        batch_size      = params["batch_size"]
        num_envs        = params["num_envs"]

        # Learning rate schedule
        def lr_scheduler(step_idx):
            return initial_lr * 0.85 ** (step_idx // 10000)

        # Environment constants
        frame_stack_size = 4
        input_shape = (84, 84, frame_stack_size)
        num_actions = test_env.action_space.shape[0]
        action_min = test_env.action_space.low
        action_max = test_env.action_space.high

        # Create model
        print("[INFO] Creating model")
        model = PPO(input_shape, num_actions, action_min, action_max,
                    epsilon=ppo_epsilon, value_scale=value_scale,
                    entropy_scale=entropy_scale, model_name=model_name)

        print("[INFO] Creating environments")
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])
        initial_frames = envs.reset()
        envs.get_images()
        frame_stacks = [FrameStack(initial_frames[i], stack_size=frame_stack_size,
                                   preprocess_fn=preprocess_frame)
                        for i in range(num_envs)]

        print("[INFO] Training loop")
        while True:  # While there are running environments
            states, taken_actions, values, rewards, dones = [], [], [], [], []

            # Simulate game for some number of steps
            for _ in range(horizon):
                # Predict action and value given state
                # π(a_t | s_t; θ_old)
                states_t = [frame_stacks[i].get_state() for i in range(num_envs)]
                actions_t, values_t = model.predict(states_t)

                # Sample action from a Gaussian distribution
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, _ = envs.step_wait()
                envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)                       # [T, N, 84, 84, 4]
                taken_actions.append(actions_t)               # [T, N, 3]
                values.append(np.squeeze(values_t, axis=-1))  # [T, N]
                rewards.append(rewards_t)                     # [T, N]
                dones.append(dones_t)                         # [T, N]

                # Get new state
                for i in range(num_envs):
                    # Reset environment's frame stack if done
                    if dones_t[i]:
                        for _ in range(frame_stack_size):
                            frame_stacks[i].add_frame(frames[i])
                    else:
                        frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [frame_stacks[i].get_state() for i in range(num_envs)]
            last_values = np.squeeze(model.predict(states_last)[1], axis=-1)  # [N]

            # Compute GAE advantages and value targets
            # (returns are computed from the raw advantages, before normalization)
            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)
            returns = advantages + values
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))                # [T x N, 84, 84, 4]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))   # [T x N, 3]
            returns = returns.flatten()                                          # [T x N]
            advantages = advantages.flatten()                                    # [T x N]

            T = len(rewards)
            N = num_envs
            assert states.shape == (T * N, input_shape[0], input_shape[1], frame_stack_size)
            assert taken_actions.shape == (T * N, num_actions)
            assert returns.shape == (T * N,)
            assert advantages.shape == (T * N,)

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Evaluate model
                    if model.step_idx % eval_interval == 0:
                        print("[INFO] Running evaluation...")
                        avg_reward, value_error = evaluate(model, test_env, discount_factor,
                                                           frame_stack_size, make_video=True)
                        model.write_to_summary("eval_avg_reward", avg_reward)
                        model.write_to_summary("eval_value_error", value_error)

                    # Save model
                    if model.step_idx % save_interval == 0:
                        model.save()

                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])
    except KeyboardInterrupt:
        model.save()
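
# NOTE: compute_gae() is called above but not defined in this section. The
# helper below is a minimal reference sketch of generalized advantage
# estimation (GAE), assuming rewards, values and dones are length-T lists of
# [N] arrays, last_values is an [N] array of bootstrap values, and numpy is
# available as np (as elsewhere in this file). The name compute_gae_reference
# is hypothetical; this is not necessarily the project's actual implementation.
def compute_gae_reference(rewards, values, last_values, dones, discount_factor, gae_lambda):
    rewards = np.asarray(rewards, dtype=np.float32)  # [T, N]
    values = np.asarray(values, dtype=np.float32)    # [T, N]
    dones = np.asarray(dones, dtype=np.float32)      # [T, N]
    # V(s_{t+1}) for every step, bootstrapped with last_values at t = T-1
    next_values = np.concatenate(
        [values[1:], np.asarray(last_values, dtype=np.float32)[None, :]], axis=0)  # [T, N]

    advantages = np.zeros_like(rewards)              # [T, N]
    gae = np.zeros(rewards.shape[1], dtype=np.float32)
    for t in reversed(range(rewards.shape[0])):
        # TD error: δ_t = r_t + γ V(s_{t+1}) (1 - done_t) - V(s_t)
        delta = rewards[t] + discount_factor * next_values[t] * (1.0 - dones[t]) - values[t]
        # A_t = δ_t + γ λ (1 - done_t) A_{t+1}
        gae = delta + discount_factor * gae_lambda * (1.0 - dones[t]) * gae
        advantages[t] = gae
    return advantages                                # [T, N]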
def main():
    # Create test env
    print("Creating test environment")
    test_env = gym.make(env_name)

    # Training parameters
    lr_scheduler = Scheduler(initial_value=3e-4, interval=1000, decay_factor=1)  # decay_factor=0.75
    std_scheduler = Scheduler(initial_value=2.0, interval=1000, decay_factor=0.75)
    discount_factor = 0.99
    gae_lambda = 0.95
    ppo_epsilon = 0.2
    t_max = 10          # 180
    num_epochs = 10
    batch_size = 40     # 64
    save_interval = 500
    eval_interval = 100
    training = True

    # Environment constants
    frame_stack_size = 4
    input_shape = (84, 84, frame_stack_size)
    num_actions = 1                 # envs.action_space.shape[0]
    action_min = np.array([-1.0])   # np.array([-1.0, 0.0, 0.0])
    action_max = np.array([1.0])    # np.array([ 1.0, 1.0, 1.0])

    # Create model
    print("Creating model")
    model_checkpoint = None  # "./models/CarRacing-v0/run2/episode0_step455000.ckpt"
    model = PPO(num_actions, input_shape, action_min, action_max, ppo_epsilon,
                value_scale=0.5, entropy_scale=0.0001,
                model_checkpoint=model_checkpoint, model_name="CarRacing-v0")

    if training:
        print("Creating environments")
        num_envs = 4
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])
        initial_frames = envs.reset()
        initial_frames = envs.get_images()
        frame_stacks = [FrameStack(initial_frames[i], preprocess_fn=preprocess_frame)
                        for i in range(num_envs)]

        print("Main loop")
        step = 0
        while training:  # While there are running environments
            print("Training...")
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            learning_rate = np.maximum(lr_scheduler.get_value(), 1e-6)
            std = np.maximum(std_scheduler.get_value(), 0.2)

            # Simulate game for some number of steps
            for _ in range(t_max):
                # Predict action and value given state
                # π(a_t | s_t; θ_old)
                states_t = [frame_stacks[i].get_state() for i in range(num_envs)]
                actions_t, values_t = model.predict(states_t, use_old_policy=True, std=std)

                # Discretize the sampled (Gaussian) actions
                for i in range(num_envs):
                    actions_t[i] = 0 if actions_t[i] < 0 else 1
                actions_t = np.squeeze(actions_t.astype(np.int32), axis=-1)

                envs.step_async(actions_t)
                frames, rewards_t, dones_t, infos = envs.step_wait()
                frames = envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)                       # [T, N, 84, 84, 4]
                taken_actions.append(actions_t)               # [T, N]
                values.append(np.squeeze(values_t, axis=-1))  # [T, N]
                rewards.append(rewards_t)                     # [T, N]
                dones.append(dones_t)                         # [T, N]

                # Get new state
                for i in range(num_envs):
                    frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [frame_stacks[i].get_state() for i in range(num_envs)]
            last_values = np.squeeze(model.predict(states_last)[-1], axis=-1)  # [N]

            # Compute returns
            returns = compute_returns(rewards, last_values, dones, discount_factor)

            # Compute advantages
            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)

            # Normalize advantages
            advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))                # [T x N, 84, 84, 4]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))   # [T x N, 1]
            returns = returns.flatten()                                          # [T x N]
            advantages = advantages.flatten()                                    # [T x N]

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                # Sample mini-batch randomly and train
                mb_idx = np.random.choice(len(states), batch_size, replace=False)

                # Optimize network
                model.train(states[mb_idx], taken_actions[mb_idx], returns[mb_idx],
                            advantages[mb_idx], learning_rate=learning_rate, std=std)

            # Reset environment's frame stack if done
            for i, done in enumerate(dones_t):
                if done:
                    frame_stacks[i].add_frame(frames[i])

            # Save and evaluate model periodically
            step += 1
            if step % save_interval == 0:
                model.save()
            if step % eval_interval == 0:
                avg_reward = evaluate(model, test_env, 10)
                model.write_to_summary("eval_avg_reward", avg_reward)

    # Training complete, evaluate model
    avg_reward = evaluate(model, test_env, 10)
    print("Model achieved a final reward of:", avg_reward)
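
# NOTE: compute_returns() is called in main() but not defined in this section.
# The helper below is a minimal reference sketch of bootstrapped discounted
# returns, assuming rewards and dones are length-T lists of [N] arrays and
# last_values is an [N] array of value estimates for the states after the last
# step, with numpy available as np. The name compute_returns_reference is
# hypothetical; this is not necessarily the project's actual implementation.
def compute_returns_reference(rewards, last_values, dones, discount_factor):
    rewards = np.asarray(rewards, dtype=np.float32)    # [T, N]
    dones = np.asarray(dones, dtype=np.float32)        # [T, N]
    returns = np.zeros_like(rewards)                   # [T, N]
    R = np.asarray(last_values, dtype=np.float32)      # [N]
    for t in reversed(range(rewards.shape[0])):
        # R_t = r_t + γ R_{t+1}, zeroing the bootstrap across episode boundaries
        R = rewards[t] + discount_factor * R * (1.0 - dones[t])
        returns[t] = R
    return returns                                     # [T, N]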