def main(args):
    if use_gpu:
        torch.backends.cudnn.deterministic = True
        print(colored("Using CUDA.", p_color))
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    test_cpu = False  # True to avoid moving gym's state to a GPU tensor at every step during testing.

    """ Create environment and get environment's info. """
    if args.env_atari:
        from my_utils.atari_wrappers import Task
        env = Task(args.env_name, num_envs=1, clip_rewards=False, seed=args.seed)
        env_test = Task(args.env_name, num_envs=1, clip_rewards=False, seed=args.seed)
    elif args.env_bullet:
        import pybullet
        import pybullet_envs
        pybullet.connect(pybullet.DIRECT)
        env = gym.make(args.env_name)
        env.seed(args.seed)
        env_test = env
        if args.render:
            env_test.render(mode="human")
    else:
        env = gym.make(args.env_name)
        env_test = gym.make(args.env_name)
        env.seed(args.seed)
        env_test.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    is_disc_action = args.env_discrete
    action_dim = (0 if is_disc_action else env.action_space.shape[0])

    if is_disc_action:
        a_bound = 1
        action_num = env.action_space.n
        print("State dim: %d, action num: %d" % (state_dim, action_num))
    else:
        """ Always normalize continuous environments. """
        from my_utils.my_gym_utils import NormalizeGymWrapper
        env = NormalizeGymWrapper(env)
        env_test = NormalizeGymWrapper(env_test)
        a_bound = np.asscalar(env.action_space.high[0])
        a_low = np.asscalar(env.action_space.low[0])
        assert a_bound == -a_low
        print("State dim: %d, action dim: %d, action bound %d" % (state_dim, action_dim, a_bound))

    """ Set method and hyper-parameters in the file name. """
    method_name = args.rl_method.upper()
    hypers = rl_hypers_parser(args)
    exp_name = "%s-%s_s%d" % (method_name, hypers, args.seed)

    """ Set paths for result and model files. """
    result_path = "./RL_results/%s/%s/%s-%s" % (method_name, args.env_name, args.env_name, exp_name)
    model_path = "./RL_results/%s_models/%s/%s-%s" % (args.rl_method.upper(), args.env_name, args.env_name, exp_name)
    pathlib.Path("./RL_results/%s/%s" % (method_name, args.env_name)).mkdir(parents=True, exist_ok=True)
    if platform.system() != "Windows":
        pathlib.Path("./RL_results/%s_models/%s" % (method_name, args.env_name)).mkdir(parents=True, exist_ok=True)
    print("Running %s" % (colored(method_name, p_color)))
    print("%s result will be saved at %s" % (colored(method_name, p_color), colored(result_path, p_color)))

    """ Define actor and critic. """
    if is_disc_action:
        if args.rl_method == "dqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=False, cnn=args.cnn)
        if args.rl_method == "ddqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=True, cnn=args.cnn)
        if args.rl_method == "qr_dqn":
            policy_updater = QR_DQN(state_dim=state_dim, action_num=action_num, args=args, cnn=args.cnn)
        if args.rl_method == "clipped_ddqn":
            policy_updater = Clipped_DDQN(state_dim=state_dim, action_num=action_num, args=args, cnn=args.cnn)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=action_num, is_discrete=True, cnn=args.cnn)
    else:
        if args.rl_method == "ac":
            policy_updater = AC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "sac":
            policy_updater = SAC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "td3":
            policy_updater = TD3(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "trpo":
            policy_updater = TRPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)

    update_type = policy_updater.update_type  # "on_policy" or "off_policy"
    if args.max_step is None:
        if update_type == "on_policy":
            args.max_step = 5000000
        elif update_type == "off_policy":
            args.max_step = 1000000
        if args.env_atari:
            args.max_step = args.max_step * 10

    """ Function to update the parameters of the value and policy networks. """
    def update_params_g(batch):
        states = torch.FloatTensor(np.stack(batch.state)).to(device)
        next_states = torch.FloatTensor(np.stack(batch.next_state)).to(device)
        masks = torch.FloatTensor(np.stack(batch.mask)).to(device).unsqueeze(-1)
        rewards = torch.FloatTensor(np.stack(batch.reward)).to(device).unsqueeze(-1)
        actions = torch.LongTensor(np.stack(batch.action)) if is_disc_action else torch.FloatTensor(np.stack(batch.action))
        policy_updater.update_policy(states, actions.to(device), next_states, rewards, masks)

    """ Storage and counters. """
    memory = Memory(capacity=1000000)  # Memory buffer with a 1 million max size.
    step, i_iter, tt_g = 0, 0, 0
    perform_test = 0
    log_interval = args.max_step // 1000  # 1000 lines in the result text file.
    save_model_interval = (log_interval * 10) * (platform.system() != "Windows")  # Do not save models on Windows.
    print("Max steps: %s, Log interval: %s steps, Model interval: %s steps" % \
        (colored(args.max_step, p_color), colored(log_interval, p_color), colored(save_model_interval, p_color)))

    """ Reset seed again. """
    if use_gpu:
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    """ Agent for testing in a separate environment. """
    agent_test = Agent(env_test, render=args.render, t_max=args.t_max, test_cpu=test_cpu)
    if args.env_bullet:
        log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)

    state = env.reset()
    done = 1  # Initialize so the pybullet test branch below does not read an undefined variable at step 0.

    """ The actual learning loop. """
    for total_step in range(0, args.max_step + 1):

        """ Save the learned policy model. """
        if save_model_interval > 0 and total_step % save_model_interval == 0:
            policy_updater.save_model("%s_policy_T%d.pt" % (model_path, total_step))

        """ Test the policy before the update. """
        if total_step % log_interval == 0:
            perform_test = 1
        if perform_test:
            if args.env_bullet:
                if done:  # env and env_test are the same object for pybullet, so only test at episode boundaries.
                    log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                    perform_test = 0
            else:
                log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                perform_test = 0

        """ Take an environment step. """
        if total_step <= args.random_action and update_type == "off_policy":
            action = env.action_space.sample()
        else:
            action = policy_updater.sample_action(torch.FloatTensor(state).to(device).unsqueeze(0)).to(device_cpu).detach().numpy()
        next_state, reward, done, _ = env.step(action)
        if step + 1 == args.t_max:
            done = 1
        memory.push(state, action, int(not done), next_state, reward, 0)
        state = next_state
        step = step + 1

        """ Reset env. """
        if done:
            state = env.reset()
            step = 0

        """ Update policy. """
        if update_type == "on_policy":
            if memory.size() >= args.big_batch_size and done:
                t0_g = time.time()
                batch = memory.sample()
                update_params_g(batch=batch)
                memory.reset()
                tt_g += time.time() - t0_g
        elif update_type == "off_policy":
            if total_step >= args.big_batch_size:
                t0_g = time.time()
                batch = memory.sample(args.mini_batch_size)
                update_params_g(batch=batch)
                tt_g += time.time() - t0_g

        """ Print the result to stdout and save it to a text file for plotting. """
        if total_step % log_interval == 0:
            result_text = t_format("Step %7d " % (total_step), 0) \
                + t_format("(g%2.2f)s" % (tt_g), 1)
            result_text += " | [R_te] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            if (args.rl_method == "sac" or args.rl_method == "vac"):
                result_text += ("| ent %0.3f" % (policy_updater.entropy_coef))
            tt_g = 0
            print(result_text)
            with open(result_path + ".txt", 'a') as f:
                print(result_text, file=f)
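# --- Usage sketch (hypothetical; the argument parser for this script lives elsewhere and is not shown here) ---
# The function above expects an `args` namespace with the attributes it reads
# (env_name, seed, rl_method, max_step, t_max, random_action, big_batch_size, mini_batch_size, ...).
# An invocation with flags matching those attribute names might look like:
#   python rl_main.py --env_name HalfCheetah-v2 --rl_method sac --seed 1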
def main(args):
    if args.il_method is None:
        method_type = "RL"  # Plain RL with the environment's rewards.
        info_method = False
        encode_dim = 0
    else:
        method_type = "IL"
        if "info" in args.il_method:
            info_method = True
            encode_dim = args.encode_dim
        else:
            info_method = False
            encode_dim = 0

    torch.manual_seed(args.seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
        print(colored("Using CUDA.", p_color))
    np.random.seed(args.seed)
    random.seed(args.seed)
    test_cpu = True  # Set to True to avoid moving gym's state to a GPU tensor at every step during testing.

    env_name = args.env_name

    """ Create environment and get environment's info. """
    if args.env_atari:
        from my_utils.atari_wrappers import Task
        env = Task(env_name, num_envs=1, clip_rewards=False, seed=args.seed)
        env_test = Task(env_name, num_envs=1, clip_rewards=False, seed=args.seed)
    elif args.env_bullet:
        import pybullet
        import pybullet_envs
        pybullet.connect(pybullet.DIRECT)
        env = gym.make(env_name)
        env.seed(args.seed)
        env_test = env
        if args.render:
            env_test.render(mode="human")
    elif args.env_robosuite:
        from my_utils.my_robosuite_utils import make_robosuite_env
        args.t_max = 500
        env = make_robosuite_env(args)
        env_test = make_robosuite_env(args)  # The sampler uses functions from Python's random module, so the seeds are already set.
        env_name = args.env_name + "_reach"
    else:
        env = gym.make(env_name)
        env.seed(args.seed)
        env_test = gym.make(env_name)
        env_test.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    is_disc_action = args.env_discrete
    action_dim = (0 if is_disc_action else env.action_space.shape[0])
    if args.env_robosuite:
        action_dim = action_dim - 1  # The gripper is disabled for reaching.

    if is_disc_action:
        a_bound = 1
        action_num = env.action_space.n
        print("State dim: %d, action num: %d" % (state_dim, action_num))
    else:
        """ Always normalize continuous environments. """
        if np.asscalar(env.action_space.high[0]) != 1:
            from my_utils.my_gym_utils import NormalizeGymWrapper
            env = NormalizeGymWrapper(env)
            env_test = NormalizeGymWrapper(env_test)
            print("Use state-normalized environments.")
        a_bound = np.asscalar(env.action_space.high[0])
        a_low = np.asscalar(env.action_space.low[0])
        assert a_bound == -a_low
        assert a_bound == 1
        print("State dim: %d, action dim: %d, action bound %d" % (state_dim, action_dim, a_bound))

    if "LunarLanderContinuous" in env_name or "BipedalWalker" in env_name:
        from my_utils.my_gym_utils import ClipGymWrapper
        env = ClipGymWrapper(env)
        env_test = ClipGymWrapper(env_test)

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    """ Define actor and critic. """
    if is_disc_action:  # Work in progress...
        if args.rl_method == "dqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=False, encode_dim=encode_dim)
        if args.rl_method == "ddqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=True, encode_dim=encode_dim)
        if args.rl_method == "qr_dqn":
            policy_updater = QR_DQN(state_dim=state_dim, action_num=action_num, args=args, encode_dim=encode_dim)
        if args.rl_method == "clipped_ddqn":
            policy_updater = Clipped_DDQN(state_dim=state_dim, action_num=action_num, args=args, encode_dim=encode_dim)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=action_num, is_discrete=True, encode_dim=encode_dim)
    else:
        if args.rl_method == "ac":
            policy_updater = AC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "sac":
            policy_updater = SAC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "td3":
            policy_updater = TD3(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "trpo":
            policy_updater = TRPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)

    update_type = policy_updater.update_type  # "on_policy" or "off_policy"
    if args.max_step is None:
        if update_type == "on_policy":
            args.max_step = 5000000
            if args.psi_param_std is None:
                args.psi_param_std = 0
        elif update_type == "off_policy":
            args.max_step = 1000000
            if args.psi_param_std is None:
                args.psi_param_std = 1
        if args.env_atari:
            args.max_step = args.max_step * 10

    if method_type == "IL":
        if args.il_method == "irl":  # Maximum entropy IRL.
            discriminator_updater = IRL(state_dim=state_dim, action_dim=action_dim, args=args)
        elif args.il_method == "gail":
            discriminator_updater = GAIL(state_dim=state_dim, action_dim=action_dim, args=args)
        elif args.il_method == "vail":
            discriminator_updater = VAIL(state_dim=state_dim, action_dim=action_dim, args=args)
        elif args.il_method == "airl":
            discriminator_updater = AIRL(state_dim=state_dim, action_dim=action_dim, args=args, policy_updater=policy_updater)  # Needs the entropy coefficient and the policy.
        elif args.il_method == "vild":
            discriminator_updater = VILD(state_dim=state_dim, action_dim=action_dim, args=args, policy_updater=policy_updater)  # Needs the entropy coefficient.
        elif args.il_method == "infogail":
            discriminator_updater = InfoGAIL(state_dim=state_dim, action_dim=action_dim, args=args, policy_updater=policy_updater)  # The AIRL version needs the entropy coefficient and the policy.

        # Pre-train pi for the robosuite env.
        if args.env_robosuite:
            discriminator_updater.behavior_cloning(policy_net=policy_updater.policy_net, learning_rate=args.learning_rate_pv, bc_step=args.bc_step)  # Pre-train pi.
        elif args.il_method == "vild":  # Pre-train only q_psi.
            discriminator_updater.behavior_cloning(policy_net=None, learning_rate=args.learning_rate_pv, bc_step=args.bc_step)

    """ Set method and hyper-parameters in the file name. """
    if method_type == "RL":
        method_name = args.rl_method.upper()
        hypers = rl_hypers_parser(args)
    else:
        method_name = args.il_method.upper() + "_" + args.rl_method.upper()
        hypers = rl_hypers_parser(args) + "_" + irl_hypers_parser(args)
        if args.il_method == "vild" and args.vild_loss_type.lower() != "linear":
            method_name += "_" + args.vild_loss_type.upper()
        if args.il_method == "infogail" and args.info_loss_type.lower() != "bce":
            method_name += "_" + args.info_loss_type.upper()

    if method_type == "RL":
        exp_name = "%s-%s_s%d" % (method_name, hypers, args.seed)
    elif method_type == "IL":
        exp_name = "%s-%s-%s_s%d" % (discriminator_updater.traj_name, method_name, hypers, args.seed)

    """ Set paths for result and model files. """
    result_path = "./results_%s/%s/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name)
    model_path = "./results_%s/%s_models/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name)
    pathlib.Path("./results_%s/%s/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True)
    # if platform.system() != "Windows":
    pathlib.Path("./results_%s/%s_models/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True)
    print("Running %s" % (colored(method_name, p_color)))
    print("%s result will be saved at %s" % (colored(method_name, p_color), colored(result_path, p_color)))

    """ Function to update the parameters of the value and policy networks. """
    def update_params_g(batch):
        states = torch.FloatTensor(np.stack(batch.state)).to(device)
        next_states = torch.FloatTensor(np.stack(batch.next_state)).to(device)
        masks = torch.FloatTensor(np.stack(batch.mask)).to(device).unsqueeze(-1)
        actions = torch.LongTensor(np.stack(batch.action)).to(device) if is_disc_action else torch.FloatTensor(np.stack(batch.action)).to(device)

        if method_type == "RL":
            rewards = torch.FloatTensor(np.stack(batch.reward)).to(device).unsqueeze(-1)
            policy_updater.update_policy(states, actions.to(device), next_states, rewards, masks)
        elif method_type == "IL":
            nonlocal d_rewards
            d_rewards = discriminator_updater.compute_reward(states, actions).detach().data

            # Append a one-hot vector of the context to the state.
            if info_method:
                latent_codes = torch.LongTensor(np.stack(batch.latent_code)).to(device).view(-1, 1)  # [batch_size, 1]
                d_rewards += discriminator_updater.compute_posterior_reward(states, actions, latent_codes).detach().data

                latent_codes_onehot = torch.FloatTensor(states.size(0), encode_dim).to(device)
                latent_codes_onehot.zero_()
                latent_codes_onehot.scatter_(1, latent_codes, 1)  # Shape [batch_size, encode_dim].

                states = torch.cat((states, latent_codes_onehot), 1)
                next_states = torch.cat((next_states, latent_codes_onehot), 1)

            policy_updater.update_policy(states, actions, next_states, d_rewards, masks)

    """ Storage and counters. """
    memory = Memory(capacity=1000000)  # Memory buffer with a 1 million max size.
    step, i_iter, tt_g, tt_d, perform_test = 0, 0, 0, 0, 0
    d_rewards = torch.FloatTensor(1).fill_(0)  # Placeholder.
    log_interval = args.max_step // 1000  # 1000 lines in the result text file.
    if args.env_robosuite:
        log_interval = args.max_step // 500  # Reduce to 500 lines to save experiment time.
    save_model_interval = (log_interval * 10)  # * (platform.system() != "Windows")  # Do not save the model?
    print("Max steps: %s, Log interval: %s steps, Model interval: %s steps" % \
        (colored(args.max_step, p_color), colored(log_interval, p_color), colored(save_model_interval, p_color)))

    """ Reset seed again. """
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    """ Agent for testing in a separate environment. """
    agent_test = Agent(env_test, render=args.render, t_max=args.t_max, test_cpu=test_cpu)
    if args.env_bullet:
        log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)

    latent_code = None  # Only used by InfoGAIL.
    state = env.reset()
    done = 1

    """ The actual learning loop. """
    for total_step in range(0, args.max_step + 1):

        """ Save the learned policy model. """
        if save_model_interval > 0 and total_step % save_model_interval == 0:
            policy_updater.save_model("%s_policy_T%d.pt" % (model_path, total_step))

        """ Test the policy before the update. """
        if total_step % log_interval == 0:
            perform_test = 1

        """ Test the learned policy. """
        if perform_test:
            if not info_method:
                if not args.env_bullet:
                    log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                    perform_test = 0
                elif done:  # Because env and env_test are the same object for pybullet.
                    log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                    perform_test = 0
            else:
                log_test = []
                for i_k in range(0, encode_dim):
                    # latent_code_test = discriminator_updater.sample_code().fill_(i_k)  # Legacy code that changes the RNG sequence. Use this line to reproduce old results.
                    latent_code_test = torch.LongTensor(size=(1, 1)).fill_(i_k)
                    latent_code_onehot_test = torch.FloatTensor(1, encode_dim)
                    latent_code_onehot_test.zero_()
                    latent_code_onehot_test.scatter_(1, latent_code_test, 1)
                    log_test += [agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10, latent_code_onehot=latent_code_onehot_test.squeeze())]  # Use 1 instead of 10 to save time?
                perform_test = 0

        if info_method and latent_code is None:
            latent_code = discriminator_updater.sample_code()  # Sample a scalar latent code from the uniform prior p(c).
            latent_code_onehot = torch.FloatTensor(1, encode_dim)
            latent_code_onehot.zero_()
            latent_code_onehot.scatter_(1, latent_code, 1)
            latent_code_onehot = latent_code_onehot.squeeze()  # Shape [encode_dim].
            latent_code = latent_code.detach().numpy()

        state_var = torch.FloatTensor(state)
        if latent_code is not None:
            state_var = torch.cat((state_var, latent_code_onehot), 0)

        """ Take an environment step. """
        if total_step <= args.random_action and update_type == "off_policy":  # Collect random actions first for off-policy methods.
            action = env.action_space.sample()
        else:
            action = policy_updater.sample_action(state_var.to(device).unsqueeze(0)).to(device_cpu).detach().numpy()

        if args.il_method == "vild":
            # Add noise from Sigma_k to the action (noise_t = sqrt(Sigma_k)).
            action_u = action + args.noise_t * np.random.normal(np.zeros(action.shape), np.ones(action.shape))
            next_state, reward, done, _ = env.step(action_u)
        else:
            next_state, reward, done, _ = env.step(action)

        if step + 1 == args.t_max:
            done = 1
        memory.push(state, action, int(not done), next_state, reward, latent_code)
        state = next_state
        step = step + 1

        """ Reset env. """
        if done:
            state = env.reset()
            step = 0
            latent_code = None

        """ Update policy. """
        if update_type == "on_policy":
            if memory.size() >= args.big_batch_size and done:
                batch = memory.sample()
                if method_type == "IL":
                    for i_d in range(0, args.d_step):
                        index = discriminator_updater.index_sampler()  # Should be inside update_discriminator for cleaner code...
                        t0_d = time.time()
                        discriminator_updater.update_discriminator(batch=batch, index=index, total_step=total_step)
                        tt_d += time.time() - t0_d
                t0_g = time.time()
                update_params_g(batch=batch)
                tt_g += time.time() - t0_g
                memory.reset()
        elif update_type == "off_policy":
            if total_step >= args.big_batch_size:
                if method_type == "IL":
                    index = discriminator_updater.index_sampler()
                    batch = memory.sample(args.mini_batch_size)
                    t0_d = time.time()
                    discriminator_updater.update_discriminator(batch=batch, index=index, total_step=total_step)
                    tt_d += time.time() - t0_d
                elif method_type == "RL":
                    batch = memory.sample(args.mini_batch_size)
                t0_g = time.time()
                update_params_g(batch=batch)
                tt_g += time.time() - t0_g

        """ Print the result to stdout and save it to a text file for plotting. """
        if total_step % log_interval == 0:
            result_text = t_format("Step %7d " % (total_step), 0)
            if method_type == "RL":
                result_text += t_format("(g%2.2f)s" % (tt_g), 1)
            elif method_type == "IL":
                c_reward_list = d_rewards.to(device_cpu).detach().numpy()
                result_text += t_format("(g%2.1f+d%2.1f)s" % (tt_g, tt_d), 1)
                result_text += " | [D] " + t_format("min: %.2f" % np.amin(c_reward_list), 0.5) + t_format(" max: %.2f" % np.amax(c_reward_list), 0.5)

            result_text += " | [R_te] "
            if not info_method:
                result_text += t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                    + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            else:
                result_text += "Avg "
                for i_k in range(0, encode_dim):
                    result_text += t_format("%d: %.2f (%.2f)" % (i_k, log_test[i_k]['avg_reward'], log_test[i_k]['std_reward']), 2)

            if (args.rl_method == "sac"):
                result_text += ("| ent %0.3f" % (policy_updater.entropy_coef))

            if args.il_method == "vild":
                # Check the estimated worker noise.
                estimated_worker_noise = discriminator_updater.worker_net.get_worker_cov().to(device_cpu).detach().numpy().squeeze()
                if action_dim > 1:
                    estimated_worker_noise = estimated_worker_noise.mean(axis=0)  # Average across the action dimensions.
                result_text += " | w_noise: %s" % (np.array2string(estimated_worker_noise, formatter={'float_kind': lambda x: "%.5f" % x}).replace('\n', ''))

            tt_g = 0
            tt_d = 0
            print(result_text)
            with open(result_path + ".txt", 'a') as f:
                print(result_text, file=f)
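# --- Illustration (standalone sketch, not part of the original script) ---
# The one-hot context vectors built above rely on Tensor.scatter_. For example, with
# encode_dim = 3 and integer codes [[2], [0]]:
#   codes = torch.LongTensor([[2], [0]])              # shape [batch, 1]
#   onehot = torch.zeros(2, 3).scatter_(1, codes, 1)  # shape [batch, encode_dim]
#   # onehot -> [[0., 0., 1.],
#   #            [1., 0., 0.]]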
def learn_model(args):
    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")
    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)
    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = np.asscalar(env.action_space.high[0])
    a_low = np.asscalar(env.action_space.low[0])
    assert a_bound == -a_low

    ## Binary flag for manually clipping actions in the step function after adding Gaussian noise.
    clip = (args.env_name == "LunarLanderContinuous-v2" or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)

    """ Define actor and critic. """
    policy_net = Policy(state_dim, action_dim, log_std=args.log_std, a_bound=a_bound, hidden_size=args.hidden_size, activation=args.activation).to(device)
    value_net = Value(state_dim, hidden_size=args.hidden_size, activation=args.activation).to(device)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate_v)
    decayed_lambda_td = args.lambda_td

    """ Function to update the parameters of the value network (critic). """
    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(np.float32)).to(device)

        """ Get advantage estimates from the trajectories. """
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau)
        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns

        """ Perform the critic update. """
        # gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # Full-batch GD.
        gae_step_epoch(value_net, optimizer_value, states, returns, args.l2_reg)  # Stochastic GD.

    """ Function to update the parameters of the policy network (actor). """
    def update_params_p(batch, i_iter):
        nonlocal decayed_lambda_td
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(np.float32)).to(device)

        """ Get advantage estimates from the trajectories; this is done after the gae_step update. """
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(rewards, masks, values, gamma=args.gamma, tau=args.tau)
        if args.method_name == "TRPO-RET-MC":
            returns = mc_returns.detach()  # detach() does not matter since we backprop through the policy network only.
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach()  # detach() does not matter here either.
        else:
            returns = 0  # returns is not used for TRPO and TRPO-TD.

        # Standardize or not?
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()  # Mean-subtracted, standardized version.
        else:
            advantages = advantages / advantages.std()  # Standardized version (no mean subtraction).

        trpo_step_td(policy_net=policy_net, value_net=value_net, states=states, actions=actions, next_states=next_states, rewards=rewards, masks=masks, gamma=args.gamma, advantages=advantages, \
            max_kl=args.max_kl, damping=args.damping, \
            lambda_td=decayed_lambda_td, method_name=args.method_name, returns=returns, mtd=args.mtd)

        """ Decay the TD regularization parameter after the update. """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """ Create agents. """
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test, policy_net, mean_action=True, render=args.render)

    """ The actual learning loop. """
    for i_iter in range(args.rl_max_iter_num):

        """ Save the learned policy model. """
        if ((i_iter) % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0) \
                or (i_iter == args.rl_max_iter_num + 1) or i_iter == 0:
            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)
            pickle.dump((policy_net, value_net), open(args.rl_model_filename + ("_I%d.p" % (i_iter)), 'wb'))
            policy_net = policy_net.to(device)
            value_net = value_net.to(device)

        """ Test the policy before the update. """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20, render=args.render, clip=clip)

        """ Generate multiple trajectories that reach the minimum batch_size. """
        t0 = time.time()
        batch, log = agent.collect_samples_train(args.min_batch_size, render=False, clip=clip)  # On-policy samples.
        t1 = time.time()

        """ Update parameters. """
        t0_d = time.time()
        update_params_c(batch, i_iter)  # Critic update.
        update_params_p(batch, i_iter)  # Actor update.
        t1_d = time.time()

        """ Print the result to stdout and save it to a text file for later use. """
        if i_iter % args.log_interval == 0:
            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" % (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format("Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)
            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
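# --- Note (assumption about estimate_advantages; its implementation is not shown here) ---
# The advantages used above are presumably the standard GAE(gamma, tau) estimates:
#   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
#   A_t     = delta_t + gamma * tau * (1 - done_t) * A_{t+1}
# with lambda_returns = A_t + V(s_t) and mc_returns the plain discounted reward sums.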
def main(args):
    if args.il_method is None:
        raise NotImplementedError
    else:
        method_type = "IL"

    torch.manual_seed(args.seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
        print(colored("Using CUDA.", p_color))
    np.random.seed(args.seed)
    random.seed(args.seed)
    test_cpu = True  # True to avoid moving gym's state to a GPU tensor at every step during testing.

    env_name = args.env_name

    """ Create environment and get environment's info. """
    if args.env_atari:
        from my_utils.atari_wrappers import Task
        env = Task(env_name, num_envs=1, clip_rewards=False, seed=args.seed)
        env_test = env
    elif args.env_bullet:
        import pybullet
        import pybullet_envs
        pybullet.connect(pybullet.DIRECT)
        env = gym.make(env_name)
        env.seed(args.seed)
        env_test = env
        if args.render:
            env_test.render(mode="human")
    elif args.env_robosuite:
        from my_utils.my_robosuite_utils import make_robosuite_env
        args.t_max = 500
        env = make_robosuite_env(args)
        env_test = env  # The sampler uses functions from Python's random module, so the seeds are already set.
        env_name = args.env_name + "_reach"
    else:
        env = gym.make(env_name)
        env.seed(args.seed)
        env_test = env

    state_dim = env.observation_space.shape[0]
    is_disc_action = args.env_discrete
    action_dim = (0 if is_disc_action else env.action_space.shape[0])
    if args.env_robosuite:
        action_dim = action_dim - 1  # The gripper is disabled for reaching.

    if is_disc_action:
        a_bound = 1
        action_num = env.action_space.n
        print("State dim: %d, action num: %d" % (state_dim, action_num))
    else:
        """ Always normalize continuous environments. """
        if np.asscalar(env.action_space.high[0]) != 1:
            from my_utils.my_gym_utils import NormalizeGymWrapper
            env = NormalizeGymWrapper(env)
            env_test = NormalizeGymWrapper(env_test)
            print("Use state-normalized environments.")
        a_bound = np.asscalar(env.action_space.high[0])
        a_low = np.asscalar(env.action_space.low[0])
        assert a_bound == -a_low
        assert a_bound == 1
        print("State dim: %d, action dim: %d, action bound %d" % (state_dim, action_dim, a_bound))

    if "LunarLanderContinuous" in env_name or "BipedalWalker" in env_name:
        from my_utils.my_gym_utils import ClipGymWrapper
        env = ClipGymWrapper(env)
        env_test = ClipGymWrapper(env_test)

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    args.max_step = 1000000
    if args.il_method == "bc":
        policy_updater = BC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    elif args.il_method == "dbc":
        policy_updater = DBC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    elif args.il_method == "cobc":
        policy_updater = COBC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    discriminator_updater = policy_updater
    update_type = policy_updater.update_type  # "off_policy"

    """ Set method and hyper-parameters in the file name. """
    method_name = args.il_method.upper()
    hypers = bc_hypers_parser(args)
    exp_name = "%s-%s-%s_s%d" % (discriminator_updater.traj_name, method_name, hypers, args.seed)

    """ Set paths for result and model files. """
    result_path = "./results_%s/%s/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name)
    model_path = "./results_%s/%s_models/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name)
    pathlib.Path("./results_%s/%s/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True)
    # if platform.system() != "Windows":
    pathlib.Path("./results_%s/%s_models/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True)
    print("Running %s" % (colored(method_name, p_color)))
    print("%s result will be saved at %s" % (colored(method_name, p_color), colored(result_path, p_color)))

    """ Storage and counters. """
    step, i_iter, tt_g, tt_d, perform_test = 0, 0, 0, 0, 0
    log_interval = args.max_step // 1000  # 1000 lines in the result text file.
    if args.env_robosuite:
        log_interval = args.max_step // 500  # Reduce to 500 lines to save experiment time.
    save_model_interval = (log_interval * 10)  # * (platform.system() != "Windows")  # Do not save models on Windows?
    print("Max steps: %s, Log interval: %s steps, Model interval: %s steps" % \
        (colored(args.max_step, p_color), colored(log_interval, p_color), colored(save_model_interval, p_color)))

    # """ Reset seed again """
    # torch.manual_seed(args.seed)
    # np.random.seed(args.seed)
    # random.seed(args.seed)

    """ Agent for testing in a separate environment. """
    agent_test = Agent(env_test, render=args.render, t_max=args.t_max, test_cpu=test_cpu)
    if args.env_bullet:
        log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
    latent_code = None  # Only used by InfoGAIL.
    # state = env.reset()

    """ The actual learning loop. """
    for total_step in range(0, args.max_step + 1):

        """ Save the learned policy model. """
        if save_model_interval > 0 and total_step % save_model_interval == 0:
            policy_updater.save_model("%s_policy_T%d.pt" % (model_path, total_step))

        """ Test the policy before the update. """
        if total_step % log_interval == 0:
            perform_test = 1

        """ Test the learned policy. """
        if perform_test:
            log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
            train_acc = policy_updater.evaluate_train_accuray()
            perform_test = 0

        """ Update policy. """
        t0_g = time.time()
        policy_updater.update_policy(total_step)
        tt_g += time.time() - t0_g

        """ Print the result to stdout and save it to a text file for plotting. """
        if total_step % log_interval == 0:
            result_text = t_format("Step %7d " % (total_step), 0)
            result_text += t_format("(bc%2.1f)s" % (tt_g), 0)
            result_text += " | [R_te] "
            result_text += t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            result_text += " | [MSE_tr] " + t_format(" %.4f" % (train_acc), 0)

            if args.il_method == "dbc":
                # Check the estimated worker noise.
                estimated_worker_noise = policy_updater.worker_net.get_worker_cov().to(device_cpu).detach().numpy().squeeze()
                if action_dim > 1:
                    estimated_worker_noise = estimated_worker_noise.mean(axis=0)  # Average across the action dimensions.
                result_text += " | w_noise: %s" % (np.array2string(estimated_worker_noise, formatter={'float_kind': lambda x: "%.3f" % x}).replace('\n', ''))

            tt_g = 0
            print(result_text)
            with open(result_path + ".txt", 'a') as f:
                print(result_text, file=f)
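# --- Usage sketch (hypothetical; the argument parser for this script is not shown here) ---
# The behavior-cloning variants above are selected via args.il_method ("bc", "dbc", or "cobc"), e.g.:
#   python bc_main.py --il_method dbc --env_name HalfCheetah-v2 --seed 1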