import os
import re
import random
import shutil

import numpy as np
import tensorflow as tf

# Project-local helpers (PPO, MlpVAE, ConvVAE, CarlaEnv, load_vae, compute_gae,
# create_encode_state_fn, make_env, test_agent, run_eval, reward_functions) are
# assumed to be imported from elsewhere in the repository.


def train(params, model_name, save_interval=10, eval_interval=10, record_eval=True, restart=False):
    # Training parameters
    learning_rate   = params["learning_rate"]
    lr_decay        = params["lr_decay"]
    discount_factor = params["discount_factor"]
    gae_lambda      = params["gae_lambda"]
    ppo_epsilon     = params["ppo_epsilon"]
    value_scale     = params["value_scale"]
    entropy_scale   = params["entropy_scale"]
    horizon         = params["horizon"]
    num_epochs      = params["num_epochs"]
    num_episodes    = params["num_episodes"]
    batch_size      = params["batch_size"]
    vae_model       = params["vae_model"]
    vae_model_type  = params["vae_model_type"]
    vae_z_dim       = params["vae_z_dim"]

    # Infer z_dim and model type from the VAE model name if they were not given
    if vae_z_dim is None:
        vae_z_dim = params["vae_z_dim"] = int(re.findall(r"zdim(\d+)", vae_model)[0])
    if vae_model_type is None:
        vae_model_type = params["vae_model_type"] = "mlp" if "mlp" in vae_model else "cnn"
    VAEClass = MlpVAE if vae_model_type == "mlp" else ConvVAE

    print("")
    print("Training parameters:")
    for k, v in params.items():
        print(f"  {k}: {v}")
    print("")

    # Load pre-trained variational autoencoder
    vae = VAEClass(input_shape=(84, 84, 1),
                   z_dim=vae_z_dim,
                   models_dir="vae",
                   model_name=vae_model,
                   training=False)
    vae.init_session(init_logging=False)
    if not vae.load_latest_checkpoint():
        raise Exception("Failed to load VAE")

    # State encoding fn
    with_measurements = False
    stack = None
    encode_state_fn = create_encode_state_fn(vae, with_measurements=with_measurements, stack=stack)

    # Create env
    print("Creating environment")
    env = make_env(model_name, frame_skip=0, encode_state_fn=encode_state_fn)
    test_env = make_env(model_name + " (Test)", encode_state_fn=encode_state_fn)

    # Environment constants
    input_shape = np.array([vae_z_dim])
    if with_measurements:
        input_shape[0] += 3
    if isinstance(stack, int):
        input_shape[0] *= stack
    num_actions = env.action_space.shape[0]
    action_min = env.action_space.low
    action_max = env.action_space.high

    # Create model
    print("Creating model")
    model = PPO(input_shape, num_actions, action_min, action_max,
                learning_rate=learning_rate, lr_decay=lr_decay,
                epsilon=ppo_epsilon, value_scale=value_scale,
                entropy_scale=entropy_scale,
                output_dir=os.path.join("models", model_name))

    # Prompt to load existing model if any
    if not restart:
        if os.path.isdir(model.log_dir) and len(os.listdir(model.log_dir)) > 0:
            answer = input("Model \"{}\" already exists. Do you wish to continue (C) or restart training (R)? ".format(model_name))
            if answer.upper() == "C":
                model.load_latest_checkpoint()
            elif answer.upper() == "R":
                restart = True
            else:
                raise Exception("There are already log files for model \"{}\". Please delete it or change model_name and try again".format(model_name))

    if restart:
        shutil.rmtree(model.output_dir)
        for d in model.dirs:
            os.makedirs(d)
    model.init_logging()
    model.write_dict_to_summary("hyperparameters", params, 0)

    # For every episode
    while model.get_episode_idx() < num_episodes:
        episode_idx = model.get_episode_idx()

        # Save model periodically
        if episode_idx % save_interval == 0:
            model.save()

        # Run evaluation periodically
        if episode_idx % eval_interval == 0:
            video_filename = os.path.join(model.video_dir, "episode{}.avi".format(episode_idx))
            eval_reward, eval_score = test_agent(test_env, model, video_filename=video_filename)
            model.write_value_to_summary("eval/score", eval_score, episode_idx)
            model.write_value_to_summary("eval/reward", eval_reward, episode_idx)

        # Reset environment
        state, terminal_state, total_reward, total_value = env.reset(), False, 0, 0

        # While episode not done
        print(f"Episode {episode_idx} (Step {model.get_train_step_idx()})")
        while not terminal_state:
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            for _ in range(horizon):
                action, value = model.predict([state], write_to_summary=True)

                # Show value on-screen
                env.env.value_label.text = "V(s)={:.2f}".format(value)

                # Perform action
                new_state, reward, terminal_state, _ = env.step(action)
                env.render()
                total_reward += reward
                total_value += value  # accumulate value estimates for the episode summary

                # Store state, action and reward
                states.append(state)          # [T, *input_shape]
                taken_actions.append(action)  # [T, num_actions]
                values.append(value)          # [T]
                rewards.append(reward)        # [T]
                dones.append(terminal_state)  # [T]
                state = new_state

                if terminal_state:
                    break

            # Calculate last value (bootstrap value)
            _, last_values = model.predict([state])  # []

            # Compute GAE
            advantages = compute_gae(rewards, values, last_values, dones, discount_factor, gae_lambda)
            returns = advantages + values
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Flatten arrays
            states        = np.array(states)
            taken_actions = np.array(taken_actions)
            returns       = np.array(returns)
            advantages    = np.array(advantages)

            T = len(rewards)
            assert states.shape == (T, *input_shape)
            assert taken_actions.shape == (T, num_actions)
            assert returns.shape == (T,)
            assert advantages.shape == (T,)

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])

        # Write episodic values
        model.write_value_to_summary("train/score", env.env.reward, episode_idx)
        model.write_value_to_summary("train/reward", total_reward, episode_idx)
        model.write_value_to_summary("train/value", total_value, episode_idx)
        model.write_episodic_summaries()
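
# Both training loops in this file rely on compute_gae() to turn a collected
# rollout into advantage estimates. Its implementation is not shown here; the
# sketch below is a minimal reference implementation of Generalized Advantage
# Estimation that matches the call signature used above
# (rewards, values, bootstrap value, dones, discount factor, lambda). It is an
# illustrative assumption, not necessarily the repository's exact code, hence
# the distinct name compute_gae_sketch.
def compute_gae_sketch(rewards, values, bootstrap_value, dones, discount_factor, gae_lambda):
    # Append the bootstrap value so delta_t can be computed for the final step
    values = np.append(np.asarray(values, dtype=np.float32), bootstrap_value)
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)

    advantages = np.zeros_like(rewards)
    last_advantage = 0.0
    # Iterate backwards, accumulating the exponentially weighted sum of TD errors:
    # A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + discount_factor * values[t + 1] * non_terminal - values[t]
        last_advantage = delta + discount_factor * gae_lambda * non_terminal * last_advantage
        advantages[t] = last_advantage
    return advantages
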
def train(params, start_carla=True, restart=False):
    # Read parameters
    learning_rate    = params["learning_rate"]
    lr_decay         = params["lr_decay"]
    discount_factor  = params["discount_factor"]
    gae_lambda       = params["gae_lambda"]
    ppo_epsilon      = params["ppo_epsilon"]
    initial_std      = params["initial_std"]
    value_scale      = params["value_scale"]
    entropy_scale    = params["entropy_scale"]
    horizon          = params["horizon"]
    num_epochs       = params["num_epochs"]
    num_episodes     = params["num_episodes"]
    batch_size       = params["batch_size"]
    vae_model        = params["vae_model"]
    vae_model_type   = params["vae_model_type"]
    vae_z_dim        = params["vae_z_dim"]
    synchronous      = params["synchronous"]
    fps              = params["fps"]
    action_smoothing = params["action_smoothing"]
    model_name       = params["model_name"]
    reward_fn        = params["reward_fn"]
    seed             = params["seed"]
    eval_interval    = params["eval_interval"]
    record_eval      = params["record_eval"]

    # Set seeds
    if isinstance(seed, int):
        tf.random.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    # Load VAE
    vae = load_vae(vae_model, vae_z_dim, vae_model_type)

    # Override params for logging
    params["vae_z_dim"] = vae.z_dim
    params["vae_model_type"] = "mlp" if isinstance(vae, MlpVAE) else "cnn"

    print("")
    print("Training parameters:")
    for k, v in params.items():
        print(f"  {k}: {v}")
    print("")

    # Create state encoding fn
    measurements_to_include = set(["steer", "throttle", "speed"])
    encode_state_fn = create_encode_state_fn(vae, measurements_to_include)

    # Create env
    print("Creating environment")
    env = CarlaEnv(obs_res=(160, 80),
                   action_smoothing=action_smoothing,
                   encode_state_fn=encode_state_fn,
                   reward_fn=reward_functions[reward_fn],
                   synchronous=synchronous,
                   fps=fps,
                   start_carla=start_carla)
    if isinstance(seed, int):
        env.seed(seed)
    best_eval_reward = -float("inf")

    # Environment constants
    input_shape = np.array([vae.z_dim + len(measurements_to_include)])
    num_actions = env.action_space.shape[0]

    # Create model
    print("Creating model")
    model = PPO(input_shape, env.action_space,
                learning_rate=learning_rate, lr_decay=lr_decay,
                epsilon=ppo_epsilon, initial_std=initial_std,
                value_scale=value_scale, entropy_scale=entropy_scale,
                model_dir=os.path.join("models", model_name))

    # Prompt to load existing model if any
    if not restart:
        if os.path.isdir(model.log_dir) and len(os.listdir(model.log_dir)) > 0:
            answer = input("Model \"{}\" already exists. Do you wish to continue (C) or restart training (R)? ".format(model_name))
            if answer.upper() == "C":
                pass
            elif answer.upper() == "R":
                restart = True
            else:
                raise Exception("There are already log files for model \"{}\". Please delete it or change model_name and try again".format(model_name))

    if restart:
        shutil.rmtree(model.model_dir)
        for d in model.dirs:
            os.makedirs(d)
    model.init_session()
    if not restart:
        model.load_latest_checkpoint()
    model.write_dict_to_summary("hyperparameters", params, 0)

    # For every episode
    while num_episodes <= 0 or model.get_episode_idx() < num_episodes:
        episode_idx = model.get_episode_idx()

        # Run evaluation periodically
        if episode_idx % eval_interval == 0:
            video_filename = os.path.join(model.video_dir, "episode{}.avi".format(episode_idx))
            eval_reward = run_eval(env, model, video_filename=video_filename)
            model.write_value_to_summary("eval/reward", eval_reward, episode_idx)
            model.write_value_to_summary("eval/distance_traveled", env.distance_traveled, episode_idx)
            model.write_value_to_summary("eval/average_speed", 3.6 * env.speed_accum / env.step_count, episode_idx)
            model.write_value_to_summary("eval/center_lane_deviation", env.center_lane_deviation, episode_idx)
            model.write_value_to_summary("eval/average_center_lane_deviation", env.center_lane_deviation / env.step_count, episode_idx)
            model.write_value_to_summary("eval/distance_over_deviation", env.distance_traveled / env.center_lane_deviation, episode_idx)
            if eval_reward > best_eval_reward:
                model.save()
                best_eval_reward = eval_reward

        # Reset environment
        state, terminal_state, total_reward = env.reset(), False, 0

        # While episode not done
        print(f"Episode {episode_idx} (Step {model.get_train_step_idx()})")
        while not terminal_state:
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            for _ in range(horizon):
                action, value = model.predict(state, write_to_summary=True)

                # Perform action
                new_state, reward, terminal_state, info = env.step(action)

                if info["closed"]:  # Viewer window was closed
                    exit(0)

                env.extra_info.extend([
                    "Episode {}".format(episode_idx),
                    "Training...",
                    "",
                    "Value: % 20.2f" % value
                ])
                env.render()
                total_reward += reward

                # Store state, action and reward
                states.append(state)          # [T, *input_shape]
                taken_actions.append(action)  # [T, num_actions]
                values.append(value)          # [T]
                rewards.append(reward)        # [T]
                dones.append(terminal_state)  # [T]
                state = new_state

                if terminal_state:
                    break

            # Calculate last value (bootstrap value)
            _, last_values = model.predict(state)  # []

            # Compute GAE
            advantages = compute_gae(rewards, values, last_values, dones, discount_factor, gae_lambda)
            returns = advantages + values
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Flatten arrays
            states        = np.array(states)
            taken_actions = np.array(taken_actions)
            returns       = np.array(returns)
            advantages    = np.array(advantages)

            T = len(rewards)
            assert states.shape == (T, *input_shape)
            assert taken_actions.shape == (T, num_actions)
            assert returns.shape == (T,)
            assert advantages.shape == (T,)

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])

        # Write episodic values
        model.write_value_to_summary("train/reward", total_reward, episode_idx)
        model.write_value_to_summary("train/distance_traveled", env.distance_traveled, episode_idx)
        model.write_value_to_summary("train/average_speed", 3.6 * env.speed_accum / env.step_count, episode_idx)
        model.write_value_to_summary("train/center_lane_deviation", env.center_lane_deviation, episode_idx)
        model.write_value_to_summary("train/average_center_lane_deviation", env.center_lane_deviation / env.step_count, episode_idx)
        model.write_value_to_summary("train/distance_over_deviation", env.distance_traveled / env.center_lane_deviation, episode_idx)
        model.write_episodic_summaries()
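
# Example invocation of the CARLA version of train() above. Every key matches
# one read by the function; the concrete values, the VAE model name, and the
# reward function key are illustrative placeholders, not recommended settings.
if __name__ == "__main__":
    example_params = {
        "learning_rate": 1e-4,
        "lr_decay": 1.0,
        "discount_factor": 0.99,
        "gae_lambda": 0.95,
        "ppo_epsilon": 0.2,
        "initial_std": 1.0,
        "value_scale": 1.0,
        "entropy_scale": 0.01,
        "horizon": 128,
        "num_epochs": 3,
        "num_episodes": 0,               # <= 0 trains until interrupted (see the while-loop condition)
        "batch_size": 32,
        "vae_model": "vae_model_name",   # placeholder: name of a trained VAE checkpoint
        "vae_model_type": None,
        "vae_z_dim": None,
        "synchronous": True,
        "fps": 30,
        "action_smoothing": 0.9,
        "model_name": "ppo_example",
        "reward_fn": "reward_fn_name",   # placeholder: must be a key in reward_functions
        "seed": 0,
        "eval_interval": 100,
        "record_eval": True,
    }
    train(example_params, start_carla=True, restart=False)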