def bcq_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5", buffer_seed=0,
              buffer_size='1000K', cut_buffer_size='1000K', eval_freq=float(1e3),
              max_timesteps=float(1e6), lr=1e-3, logger_kwargs=dict()):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    """set up logger"""
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    file_name = "BCQ_%s_%s" % (env_set, seed)
    buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed)
    print("---------------------------------------")
    print("Task: " + file_name)
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = BCQ_bl.BCQ(state_dim, action_dim, max_action, lr=lr)

    # Load buffer
    replay_buffer = utils.ReplayBuffer()
    replay_buffer.load(buffer_name + '_' + buffer_size)
    if buffer_size != cut_buffer_size:
        replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3)
    print(replay_buffer.get_length())
    print('buffer setting:', buffer_name + '_' + cut_buffer_size)

    episode_num = 0
    done = True

    training_iters, epoch = 0, 0
    while training_iters < max_timesteps:
        epoch += 1
        pol_vals = policy.train(replay_buffer, iterations=int(eval_freq), logger=logger)

        avgtest_reward = evaluate_policy(policy, test_env)
        training_iters += eval_freq

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('AverageTestEpRet', avgtest_reward)
        logger.log_tabular('TotalSteps', training_iters)
        logger.log_tabular('QLoss', average_only=True)
        logger.log_tabular('Q1Vals', with_min_and_max=True)
        logger.log_tabular('Q2Vals', with_min_and_max=True)
        logger.log_tabular('ActLoss', with_min_and_max=True)
        logger.dump_tabular()
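# --- Hedged sketch (not from the source): the evaluate_policy helper that the training
# loops in this file call but do not define. This is a minimal sketch, assuming the policy
# exposes select_action(state) and that a fixed number of deterministic episodes is averaged;
# the repo's actual helper may differ in details such as the default episode count.
def evaluate_policy_sketch(policy, eval_env, eval_episodes=10):
    total_reward = 0.0
    for _ in range(eval_episodes):
        obs, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = eval_env.step(action)
            total_reward += reward
    # average undiscounted return over the evaluation episodes
    return total_reward / eval_episodes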
os.makedirs("./pytorch_models") env = gym.make(args.env_name) # Set seeds env.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy and buffer policy = DDPG.DDPG(state_dim, action_dim, max_action) replay_buffer = utils.ReplayBuffer() total_timesteps = 0 episode_num = 0 done = True while total_timesteps < args.max_timesteps: if done: if total_timesteps != 0: print(total_timesteps, episode_num, episode_timesteps, episode_reward) print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" % (total_timesteps, episode_num, episode_timesteps, episode_reward))
def ue_train(env_set="Hopper-v2", seed=1, buffer_type="FinalSigma0.5", buffer_seed=0,
             buffer_size='1000K', cut_buffer_size='1000K', gamma=0.99, rollout=1000,
             loss_k=10000, max_ue_trainsteps=int(1e6), logger_kwargs=dict()):

    print('testing MClength:', rollout)
    print('Training loss ratio k:', loss_k)
    print('Discount value', gamma)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed)
    setting_name = "%s_%s_r%s_g%s" % (buffer_name, cut_buffer_size, rollout, gamma)
    print("---------------------------------------")
    print("Settings: " + setting_name)
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Load buffer
    replay_buffer = utils.ReplayBuffer()
    replay_buffer.load(buffer_name + '_' + buffer_size)
    if buffer_size != cut_buffer_size:
        replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3)
    print(replay_buffer.get_length())
    print('buffer setting:', buffer_name + '_' + cut_buffer_size)

    if not os.path.exists('./results/ueMC_%s_Gt.npy' % setting_name):
        save_s = not os.path.exists("./results/ueMC_%s_S.npy" % (buffer_name + '_' + cut_buffer_size))

        # extract (s, a, r) pairs from the replay buffer
        length = replay_buffer.get_length()
        print(length)
        states, actions, gts = [], [], []
        for ind in range(length):
            state, _, action, _, dint = replay_buffer.index(ind)
            gt = calculate_mc_ret(replay_buffer, ind, rollout=rollout, discount=gamma)
            gts.append(gt)
            states.append(state)
            actions.append(action)

        if save_s:
            np.save('./results/ueMC_%s_S' % (buffer_name + '_' + cut_buffer_size), states)
            np.save('./results/ueMC_%s_A' % (buffer_name + '_' + cut_buffer_size), actions)
        np.save('./results/ueMC_%s_Gt' % setting_name, gts)

    print('ue train starts ==')

    states = np.load('./results/ueMC_%s_S.npy' % (buffer_name + '_' + cut_buffer_size), allow_pickle=True)
    actions = np.load('./results/ueMC_%s_A.npy' % (buffer_name + '_' + cut_buffer_size), allow_pickle=True)
    gts = np.load('./results/ueMC_%s_Gt.npy' % setting_name, allow_pickle=True)

    upper_envelope, ue_lossval = train_upper_envelope(states, actions, gts, state_dim, device, seed,
                                                      max_step_num=max_ue_trainsteps, k=loss_k, logger=logger)
    torch.save(upper_envelope.state_dict(),
               '%s/%s_UE.pth' % ("./pytorch_models", setting_name + '_s%s_lok%s' % (seed, loss_k)))
    print('ue train finished --')

    print('plotting ue --')
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope.load_state_dict(
        torch.load('%s/%s_UE.pth' % ("./pytorch_models", setting_name + '_s%s_lok%s' % (seed, loss_k))))
    plot_envelope(upper_envelope, states, actions, gts,
                  setting_name + '[k=%s_MClen=%s_gamma=%s' % (loss_k, rollout, gamma) + 'loss%.2f' % ue_lossval,
                  seed)
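# --- Hedged sketch (not from the source): the calculate_mc_ret helper used by ue_train.
# Assumptions: replay_buffer.index(i) returns (state, next_state, action, reward, done) in
# that order (matching the unpacking above), and the Monte Carlo return is the discounted
# reward sum rolled forward from index ind for at most `rollout` steps, stopping at a
# terminal transition. The repo's real helper may handle episode boundaries differently.
def calculate_mc_ret_sketch(replay_buffer, ind, rollout=1000, discount=0.99):
    g, weight = 0.0, 1.0
    length = replay_buffer.get_length()
    for i in range(ind, min(ind + rollout, length)):
        _, _, _, reward, done = replay_buffer.index(i)
        g += weight * reward
        if done:
            break
        weight *= discount
    return g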
def ddpg_genbuf(env_set="Hopper-v2", seed=0, max_timesteps=float(1e6), start_timesteps=int(1e3),
                expl_noise=0.5, eval_freq='episode_timesteps', logger_kwargs=dict()):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    """set up logger"""
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    file_name = "DDPG_%s_%s" % (env_set, str(seed))
    buffer_name = "FinalSigma%s_%s_%s_%sK" % (str(expl_noise), env_set, str(seed), str(int(max_timesteps / 1e3)))
    exp_name = "ddpg_collection_%s_steps%s_sigma%s_%s" % (env_set, str(max_timesteps), str(expl_noise), str(seed))
    print("---------------------------------------")
    print("Settings: " + file_name)
    print("Save Buffer as: " + buffer_name)
    print("---------------------------------------")

    if not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    '''for algorithms that interact with the environment we also have to seed env.action_space'''
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    print('max episode length', env._max_episode_steps)

    # Initialize policy and buffer
    policy = DDPG_col.DDPG(state_dim, action_dim, max_action)
    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    episode_num = 0
    done = True

    while total_timesteps < max_timesteps:

        if done:
            if total_timesteps != 0:
                policy.train(replay_buffer, episode_timesteps)

                avgtest_reward = evaluate_policy(policy, test_env, eval_episodes=10)

                logger.log_tabular('Episode', episode_num)
                logger.log_tabular('AverageTestEpRet', avgtest_reward)
                logger.log_tabular('TotalSteps', total_timesteps)
                logger.log_tabular('EpRet', episode_reward)
                logger.log_tabular('EpLen', episode_timesteps)
                logger.dump_tabular()

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(obs))
            if expl_noise != 0:
                action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])) \
                    .clip(env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        episode_reward += reward

        episode_timesteps += 1
        total_timesteps += 1

        done_bool = 0 if episode_timesteps == env._max_episode_steps else float(done)

        # Store data in replay buffer
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        obs = new_obs

    # Save final policy
    policy.save("%s" % (file_name), directory="./pytorch_models")

    # Save final buffer
    replay_buffer.save(buffer_name)
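# --- Hedged sketch (not from the source): the minimal utils.ReplayBuffer interface these
# scripts rely on (add/save/load/index/get_length/cut_final). Method behavior, the tuple
# layout (obs, new_obs, action, reward, done_bool), the "./buffers" save path, and the
# "keep the final N transitions" reading of cut_final are all assumptions for illustration.
class ReplayBufferSketch(object):
    def __init__(self):
        self.storage = []

    def add(self, data):
        # data = (obs, new_obs, action, reward, done_bool), as appended in ddpg_genbuf
        self.storage.append(data)

    def index(self, i):
        # random access to a single stored transition
        return self.storage[i]

    def get_length(self):
        return len(self.storage)

    def cut_final(self, buffer_size):
        # assumption: keep only the final `buffer_size` transitions of the loaded buffer
        self.storage = self.storage[-int(buffer_size):]

    def save(self, filename):
        # path is an assumption; stored as an object array so transitions keep their shapes
        np.save("./buffers/%s.npy" % filename, np.array(self.storage, dtype=object), allow_pickle=True)

    def load(self, filename):
        self.storage = list(np.load("./buffers/%s.npy" % filename, allow_pickle=True))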
def bc_ue_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5", buffer_seed=0,
                buffer_size='1000K', cut_buffer_size='1000K', ue_seed_list=[1], gamma=0.99,
                ue_rollout=1000, ue_loss_k=10000, clip_ue=None, detect_interval=10000,
                eval_freq=float(500), max_timesteps=float(1e5), lr=1e-3, wd=0, border=0.9,
                logger_kwargs=dict()):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    """set up logger"""
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    rollout_list = [None, 1000, 200, 100, 10]
    k_list = [10000, 1000, 100, 100000, 50000, 5000]

    file_name = "BCueclip_%s_%s" % (env_set, seed)
    buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed)
    setting_name = "%s_%s_r%s_g%s" % (buffer_name, cut_buffer_size, ue_rollout, gamma)
    print("---------------------------------------")
    print("Settings: " + file_name)
    print("---------------------------------------")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Load buffer
    replay_buffer = utils.ReplayBuffer()
    replay_buffer.load(buffer_name + '_' + buffer_size)
    if buffer_size != cut_buffer_size:
        replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3)
    print(replay_buffer.get_length())
    print('buffer setting:', buffer_name + '_' + cut_buffer_size)

    print('clip and selection type:', clip_ue)
    env_bs_dic = {'Hopper-v2': [4, 4], 'Walker2d-v2': [3, 5], 'HalfCheetah-v2': [1, 1]}

    if clip_ue is None:
        best_ue_seed = env_bs_dic[env_set][buffer_seed]
        C = None
    elif clip_ue == "s-auto":
        best_ue_seed = env_bs_dic[env_set][buffer_seed]
        print('-- Do clipping on the selected envelope --')
        C, _ = get_ue_clipping_info(best_ue_seed, ue_loss_k, detect_interval, setting_name, state_dim,
                                    buffer_info=buffer_name + '_' + cut_buffer_size,
                                    ue_setting='[k=%s_MClen=%s_gamma=%s' % (ue_loss_k, ue_rollout, gamma))
    elif clip_ue == "f-auto":
        print('-- Do clipping on each envelope --')
        ues_info = dict()
        for ue_seed in ue_seed_list:
            ues_info[ue_seed] = get_ue_clipping_info(ue_seed, ue_loss_k, detect_interval, setting_name, state_dim,
                                                     buffer_info=buffer_name + '_' + cut_buffer_size,
                                                     ue_setting='[k=%s_MClen=%s_gamma=%s' % (ue_loss_k, ue_rollout, gamma))
        print('Auto clipping info:', ues_info)
        clipping_val_list, clipping_loss_list = tuple(map(list, zip(*ues_info.values())))
        sele_idx = int(np.argmin(np.array(clipping_loss_list)))
        best_ue_seed = ue_seed_list[sele_idx]
        C = clipping_val_list[sele_idx]
        print("Best UE", best_ue_seed, "Clipping value:", C)

    print('-- Policy train starts --')

    gts = np.load('./results/ueMC_%s_Gt.npy' % setting_name, allow_pickle=True)
    print('Load MC returns from', './results/ueMC_%s_Gt.npy' % setting_name)

    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope.load_state_dict(
        torch.load('%s/%s_UE.pth' % ("./pytorch_models", setting_name + '_s%s_lok%s' % (best_ue_seed, ue_loss_k))))
    print('Load best envelope from',
          '%s/%s_UE.pth' % ("./pytorch_models", setting_name + '_s%s_lok%s' % (best_ue_seed, ue_loss_k)))
    print('with testing MClength:', ue_rollout, 'training loss ratio k:', ue_loss_k)

    # plot_envelope(upper_envelope, states, actions, gts, buffer_name, seed)

    # Initialize policy
    policy = BC_ue_border_clip.BC_ue(state_dim, action_dim, max_action, lr=lr, wd=wd,
                                     ue_valfunc=upper_envelope, mc_rets=gts)

    episode_num = 0
    done = True

    training_iters, epoch = 0, 0
    while training_iters < max_timesteps:
        epoch += 1
        pol_vals = policy.train(replay_buffer, iterations=int(eval_freq), border=border, logger=logger, C=C)

        avgtest_reward = evaluate_policy(policy, test_env)
        training_iters += eval_freq

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('AverageTestEpRet', avgtest_reward)
        logger.log_tabular('TotalSteps', training_iters)
        logger.log_tabular('Loss', average_only=True)
        logger.log_tabular('SVal', with_min_and_max=True)
        logger.log_tabular('UpSize', with_min_and_max=True)
        logger.dump_tabular()
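# --- Hedged usage sketch (not from the source): how the stages above are assumed to chain
# together when run from a single entry point. In the actual repo these functions live in
# separate scripts; argument values below are illustrative only, and logger_kwargs would
# normally be supplied by a spinup-style setup helper (an assumption).
if __name__ == "__main__":
    # 1) collect a behavioral batch with a noisy DDPG agent and save it as a buffer
    ddpg_genbuf(env_set="Hopper-v2", seed=0, max_timesteps=float(1e6), expl_noise=0.5)
    # 2) fit the upper envelope of Monte Carlo returns on that fixed buffer
    ue_train(env_set="Hopper-v2", seed=1, buffer_seed=0, rollout=1000, loss_k=10000)
    # 3) train the batch RL learners on the same buffer
    bcq_learn(env_set="Hopper-v2", seed=0, buffer_seed=0)
    bc_ue_learn(env_set="Hopper-v2", seed=0, buffer_seed=0, clip_ue=None)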