def training_evaluation(env, policy):
    """Evaluates the performance of the current policy in several rollouts.

    Args:
        env (gym.Env): The gym environment you want to use.
        policy (object): The current policy.

    Returns:
        dict: Dictionary containing the mean return ("return") and mean episode
            length ("average_length") of the evaluation rollouts.
    """
    # Retrieve action space bounds from env
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Training setting
    total_cost = []
    death_rates = []
    episode_length = []
    die_count = 0
    seed_average_cost = []

    # Perform rollouts to evaluate performance
    for i in range(TRAIN_PARAMS["num_of_evaluation_paths"]):
        cost = 0
        s = env.reset()
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve action from the (deterministic) evaluation policy and
            # rescale it from [-1, 1] to the environment's action bounds
            a = policy.choose_action(s, True)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)
            cost += r

            # Terminate episode when the maximum number of steps is reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            s = s_
            if done:
                seed_average_cost.append(cost)
                episode_length.append(j)
                if j < ENV_PARAMS["max_ep_steps"] - 1:
                    die_count += 1
                break

    # Save evaluation results
    total_cost.append(np.mean(seed_average_cost))
    total_cost_mean = np.average(total_cost)
    average_length = np.average(episode_length)

    # Return evaluation results
    diagnostic = {
        "return": total_cost_mean,
        "average_length": average_length,
    }
    return diagnostic
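
# A minimal usage sketch of `training_evaluation` (illustrative only, not part of
# the original module). It assumes a gym-style environment created through
# `get_env_from_name` and a `LAC` policy constructed as in `train()` below; the
# variable names are hypothetical.
#
#   env = get_env_from_name(ENV_NAME)
#   policy = LAC(env.action_space.shape[0], env.observation_space.shape[0])
#   diagnostics = training_evaluation(env, policy)
#   print(diagnostics["return"], diagnostics["average_length"])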
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """
    # Create environment
    env = get_env_from_name(ENV_NAME)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Create replay memory buffer
    pool_params = {
        "s_dim": s_dim,
        "a_dim": a_dim,
        "d_dim": 1,
        "store_last_n_paths": TRAIN_PARAMS["num_of_training_paths"],
        "memory_capacity": ALG_PARAMS["memory_capacity"],
        "min_memory_size": ALG_PARAMS["min_memory_size"],
    }
    pool = Pool(pool_params)
    # pool = Pool(
    #     s_dim=s_dim,
    #     a_dim=a_dim,
    #     store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
    #     memory_capacity=ALG_PARAMS["memory_capacity"],
    #     min_memory_size=ALG_PARAMS["min_memory_size"],
    # )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, info = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)

            # Optimize weights and parameters using SGD
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform SGD a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Store the finished path and decay the learning rates at episode end
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
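
# Minimal entry-point sketch (an assumption, not part of the original module):
# builds a hypothetical log directory from ENV_NAME and starts training. The
# directory layout below is illustrative; adapt it to the repository's own
# logging conventions.
if __name__ == "__main__":
    log_dir = "./log/" + ENV_NAME.lower() + "/" + time.strftime("%Y%m%d_%H%M")  # hypothetical layout
    train(log_dir)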