def load_checkpoint(checkpoint_path, rb_path, policy, args):
    fpath = os.path.join(checkpoint_path, 'model.pyth')
    checkpoint = torch.load(fpath, map_location='cpu')
    # change to default graph before loading
    policy.change_morphology([-1])
    # load and return checkpoint
    policy.actor.load_state_dict(checkpoint['actor_state'])
    policy.critic.load_state_dict(checkpoint['critic_state'])
    policy.actor_target.load_state_dict(checkpoint['actor_target_state'])
    policy.critic_target.load_state_dict(checkpoint['critic_target_state'])
    policy.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state'])
    policy.critic_optimizer.load_state_dict(
        checkpoint['critic_optimizer_state'])
    # load replay buffer
    all_rb_files = [f[:-4] for f in os.listdir(rb_path) if '.npy' in f]
    all_rb_files.sort()
    replay_buffer_new = dict()
    for name in all_rb_files:
        if len(all_rb_files) > args.rb_max // 1e6:
            replay_buffer_new[name] = utils.ReplayBuffer(
                max_size=args.rb_max // len(all_rb_files))
        else:
            replay_buffer_new[name] = utils.ReplayBuffer()
        replay_buffer_new[name].max_size = int(checkpoint['rb_max'][name])
        replay_buffer_new[name].ptr = int(checkpoint['rb_ptr'][name])
        replay_buffer_new[name].slicing_size = checkpoint['rb_slicing_size'][name]
        replay_buffer_new[name].storage = list(
            np.load(os.path.join(rb_path, '{}.npy'.format(name))))
    return checkpoint['total_timesteps'], \
        checkpoint['episode_num'], \
        replay_buffer_new, \
        checkpoint['num_samples'], \
        fpath
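# The checkpoint layout load_checkpoint expects can be read off the keys
# above. A minimal sketch of a matching writer, assuming such a helper exists
# (the name, signature, and save cadence are not from the snippet); buffer
# contents would be saved separately as <name>.npy under rb_path.
def save_checkpoint(checkpoint_path, policy, replay_buffers, total_timesteps,
                    episode_num, num_samples):
    checkpoint = {
        'actor_state': policy.actor.state_dict(),
        'critic_state': policy.critic.state_dict(),
        'actor_target_state': policy.actor_target.state_dict(),
        'critic_target_state': policy.critic_target.state_dict(),
        'actor_optimizer_state': policy.actor_optimizer.state_dict(),
        'critic_optimizer_state': policy.critic_optimizer.state_dict(),
        'rb_max': {n: rb.max_size for n, rb in replay_buffers.items()},
        'rb_ptr': {n: rb.ptr for n, rb in replay_buffers.items()},
        'rb_slicing_size': {n: rb.slicing_size for n, rb in replay_buffers.items()},
        'total_timesteps': total_timesteps,
        'episode_num': episode_num,
        'num_samples': num_samples,
    }
    torch.save(checkpoint, os.path.join(checkpoint_path, 'model.pyth'))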
def __init__(self, params):
    self.state_size = params['state_size']
    self.action_size = params['action_size']
    self.buffer_size = params['buffer_size']
    self.batch_size = params['batch_size']
    self.nb_agents = params['nb_agents']
    self.learning_rate_Q = params['learning_rate_Q']
    self.learning_rate_mu = params['learning_rate_mu']
    self.memory = utils.ReplayBuffer(self.buffer_size, self.batch_size)
    self.device = params['device']
    self.tau = params['tau']
    self.gamma = params['gamma']
    self.Q = network.Q_estimator(
        self.state_size * self.nb_agents,
        self.action_size * self.nb_agents).to(self.device)
    self.Q_hat = network.Q_estimator(
        self.state_size * self.nb_agents,
        self.action_size * self.nb_agents).to(self.device)
    self.Q_hat.load_state_dict(self.Q.state_dict())
    self.optim_Q = torch.optim.Adam(self.Q.parameters(),
                                    lr=self.learning_rate_Q)
    self.mu = network.mu_estimator(self.state_size,
                                   self.action_size).to(self.device)
    self.mu_hat = network.mu_estimator(self.state_size,
                                       self.action_size).to(self.device)
    self.mu_hat.load_state_dict(self.mu.state_dict())
    self.optim_mu = torch.optim.Adam(self.mu.parameters(),
                                     lr=self.learning_rate_mu)
def test_procedure(shared_actor, env):
    num_actions = env.action_space.n
    local_actor = nets.Actor(num_actions=num_actions)
    # load parameters from shared models
    begin_time = time.time()
    while True:
        replay_buffer = utils.ReplayBuffer(size=4, frame_history_len=4)
        local_actor.load_state_dict(shared_actor.state_dict())
        obs = env.reset()
        rewards = []
        while True:
            replay_buffer.store_frame(obs)
            states = replay_buffer.encode_recent_observation()
            states = np.expand_dims(states, axis=0) / 255.0 - .5
            logits = local_actor(
                Variable(torch.FloatTensor(states.astype(np.float32))))
            action = utils.epsilon_greedy(logits,
                                          num_actions=env.action_space.n,
                                          epsilon=-1.)
            obs, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                print("Time:{}, computer:{}, agent:{}".format(
                    time.time() - begin_time,
                    sum(np.array(rewards) == -1),
                    sum(np.array(rewards) == 1)))
                break
def __init__(self, headless=1):
    self.gamma = 0.99
    self.batch_size = 128
    self.critic_learning_rate = 0.005
    self.actor_learning_rate = 0.005
    self.tau = 0.001  # copy rate between target net and real net

    ## ENVIRONMENT
    self.env = utils.Denv(headless=headless,
                          location="donkey-generated-roads-v0")

    ## POLICY
    self.critic = Critic(obs_dim=1, action_dim=2)
    self.critic_target = Critic(obs_dim=1, action_dim=2)
    # Copy critic target parameters
    for target_param, param in zip(self.critic_target.parameters(),
                                   self.critic.parameters()):
        target_param.data.copy_(param.data)
    self.actor = Actor()
    self.actor_target = Actor()
    # Copy actor target parameters
    for target_param, param in zip(self.actor_target.parameters(),
                                   self.actor.parameters()):
        target_param.data.copy_(param.data)

    # OPTIMIZERS
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.critic_learning_rate)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.actor_learning_rate)

    ## MEMORY
    self.memory = utils.ReplayBuffer(capacity=5000, seed=0)
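# self.tau above is the copy rate used when blending target and online
# networks during updates. A minimal sketch of the Polyak soft update it
# implies; the helper name soft_update is an assumption, not part of the
# snippet.
def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)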
def main(args):
    r = redis.Redis(host='10.10.1.2', port=6379, db=0)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_obs_dim, model_output_dim = 4, 6
    # model_obs_dim, model_output_dim = np.size(utils.HL_obs(state)), np.size(utils.HL_delta_obs(state, state))
    utils.make_dir(args.save_dir)

    HL_replay_buffer = utils.ReplayBuffer(
        model_obs_dim, args.z_dim, model_output_dim, device,
        args.num_iters * args.num_latent_action_per_iteration)
    HL_replay_buffer.load_buffer('./save_data/trial_4')
    # HL_replay_buffer.idx = 1415

    high_level_planning = HLPM.high_level_planning(
        device=device,
        model_obs_dim=model_obs_dim,
        z_dim=args.z_dim,
        model_output_dim=model_output_dim,
        model_hidden_num=args.model_hidden_num,
        batch_size=args.batch_size,
        model_lr=args.model_lr,
        high_level_policy_type=args.high_level_policy_type,
        update_sample_policy=args.update_sample_policy,
        update_sample_policy_lr=args.update_sample_policy_lr,
        low_level_policy_type=args.low_level_policy_type,
        num_timestep_per_footstep=args.num_timestep_per_footstep,
        model_update_steps=args.model_update_steps,
        control_frequency=args.control_frequency)

    # collect_data_client(args, r, high_level_planning, HL_replay_buffer)
    train_model(args, HL_replay_buffer, high_level_planning)
def train_BCQ(state_dim, action_dim, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"

    # Initialize policy
    policy = BCQ.BCQ(state_dim, action_dim, max_action, device,
                     args.discount, args.tau, args.lmbda, args.phi)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}", args.load_buffer_size)

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    while training_iters < args.max_timesteps:
        pol_vals = policy.train(replay_buffer,
                                iterations=int(args.eval_freq),
                                batch_size=args.batch_size)
        evaluations.append(eval_policy(policy, args.env, args.seed))
        np.save(
            f"./results/BCQ_N{args.load_buffer_size}_phi{args.phi}_{buffer_name}",
            evaluations)
        training_iters += args.eval_freq
        print(f"Training iterations: {training_iters}")
def make_buffer(hdf5_path):
    """
    Add transition tuples from batch file to replay buffer.
    """
    rb = utils.ReplayBuffer()
    f = h5py.File(hdf5_path, "r")
    demos = list(f["data"].keys())
    total_transitions = f["data"].attrs["total"]
    print("Loading {} transitions from {}...".format(total_transitions,
                                                     hdf5_path))
    env_name = f["data"].attrs["env"]
    for i in range(len(demos)):
        ep = demos[i]
        obs = f["data/{}/obs".format(ep)][()]
        actions = f["data/{}/actions".format(ep)][()]
        rewards = f["data/{}/rewards".format(ep)][()]
        next_obs = f["data/{}/next_obs".format(ep)][()]
        dones = f["data/{}/dones".format(ep)][()]

        ### important: this is action clipping! ###
        actions = np.clip(actions, -1., 1.)

        zipped = zip(obs, actions, rewards, next_obs, dones)
        for item in zipped:
            ob, ac, rew, next_ob, done = item
            # Expects tuples of (state, next_state, action, reward, done)
            rb.add((ob, next_ob, ac, rew, done))
    f.close()
    return rb, env_name
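# A hypothetical call site for make_buffer; the file name is a placeholder.
# The expected HDF5 layout, inferred from the reads above:
#   data  (attrs: "total", "env")
#     <demo_key>/obs, actions, rewards, next_obs, dones
rb, env_name = make_buffer("demos.hdf5")  # placeholder path
print("Built replay buffer for env:", env_name)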
def __init__(self, agent_dict={}, actor_dict={}, critic_dict={}):
    """
    Initialize Agent object

    Params
    ======
        agent_dict(dict): dictionary containing parameters for the agent
        actor_dict(dict): dictionary containing parameters for the agent's actor model
        critic_dict(dict): dictionary containing parameters for the agent's critic model
    """
    enable_cuda = agent_dict.get("enable_cuda", False)
    if enable_cuda:
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        self.device = torch.device("cpu")

    self.num_agents = agent_dict.get("num_agents", 20)
    self.num_episodes = agent_dict.get("num_episodes", 10000)
    self.save_after = agent_dict.get("save_after", -1)
    self.name = agent_dict.get("name", "reacher")
    self.gamma = agent_dict.get("gamma", 0.9)
    self.tau = agent_dict.get("tau", 0.001)
    self.noise = utils.OUNoise((self.num_agents, 4), 0)
    self.num_replays = agent_dict.get("num_replays", 1)
    self.learning_rate_actor = agent_dict.get("learning_rate_actor", 1E-3)
    self.learning_rate_critic = agent_dict.get("learning_rate_critic", 1E-3)
    self.criterion = nn.MSELoss()

    memory_size = agent_dict.get("memory_size", 2**14)
    batchsize = agent_dict.get("batchsize", 2**10)
    replay_reg = agent_dict.get("replay_reg", 0.0)
    self.replay_buffer = utils.ReplayBuffer(memory_size, batchsize)

    self.actor = model.ActorModel(actor_dict).to(self.device)
    self.actor_target = model.ActorModel(actor_dict).to(self.device)
    self.critic = model.CriticModel(critic_dict).to(self.device)
    self.critic_target = model.CriticModel(critic_dict).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                      lr=self.learning_rate_actor)
    self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                       lr=self.learning_rate_critic)
    utils.copy_model(self.actor, self.actor_target, tau=1.0)
    utils.copy_model(self.critic, self.critic_target, tau=1.0)

    seed = agent_dict.get("seed", 0)
    torch.manual_seed(seed)
    np.random.seed(seed)
def train(sess, env, args, actor, critic, action_bound):
    actor_loss = -tf.reduce_mean(critic.total_out)
    actor_train_step = tf.train.AdamOptimizer(args['actor_lr']).minimize(
        actor_loss, var_list=actor.network_params)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    while total_timesteps < args['max_timesteps']:
        if total_timesteps != 0 and total_timesteps % args['save_timesteps'] == 0:
            saver.save(sess, os.path.join(args['save_dir'], args['env']))

        if done:
            if total_timesteps != 0:
                print("total - ", total_timesteps, "episode num ",
                      episode_num, "episode reward ", episode_reward)
                training(sess, actor, critic, actor_train_step, action_bound,
                         replay_buffer, args, episode_timesteps)

            if total_timesteps != 0 and episode_num % args['eval_episodes'] == 0:
                print('starting evaluation')
                eval(env, actor)

            s = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        if total_timesteps < args['start_timesteps']:
            action = env.action_space.sample()
        else:
            action = actor.predict(np.reshape(s, (1, actor.s_dim)))
            action = (action + np.random.normal(
                0, args['expl_noise'], size=action.shape)).clip(
                    env.action_space.low, env.action_space.high)
            action = np.reshape(action, [-1])
            # print('new shape is ', action.shape)

        s2, r, done, info = env.step(action)
        episode_reward += r
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        replay_buffer.add((s, s2, action, r, done_bool))

        s = s2
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
def main(cfg):
    # define env & high level planning part & low level trajectory generator
    # & replay buffer for HLP
    # initialize logger
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = daisy_API(sim=cfg.sim, render=False, logger=False)
    env.set_control_mode(cfg.control_mode)
    state = env.reset()
    com_utils = utils.CoM_frame_MPC()

    if cfg.sim:
        init_state = motion_library.exp_standing(env)

    model_obs_dim, model_output_dim = np.size(com_utils.HL_obs(state)), \
        np.size(com_utils.HL_delta_obs(state, state))

    HL_replay_buffer = utils.ReplayBuffer(
        model_obs_dim, cfg.z_dim, model_output_dim, device,
        cfg.num_iters * cfg.num_latent_action_per_iteration)

    high_level_planning = HLPM.high_level_planning(
        device=device,
        model_obs_dim=model_obs_dim,
        z_dim=cfg.z_dim,
        model_output_dim=model_output_dim,
        model_hidden_num=cfg.model_hidden_num,
        model_layer_num=cfg.model_layer_num,
        batch_size=cfg.batch_size,
        model_lr=cfg.model_lr,
        high_level_policy_type=cfg.high_level_policy_type,
        update_sample_policy=cfg.update_sample_policy,
        update_sample_policy_lr=cfg.update_sample_policy_lr,
        low_level_policy_type=cfg.low_level_policy_type,
        num_timestep_per_footstep=cfg.num_timestep_per_footstep,
        model_update_steps=cfg.model_update_steps,
        control_frequency=cfg.control_frequency)

    low_level_TG = LLTG.low_level_TG(
        device=device,
        z_dim=cfg.z_dim,
        a_dim=cfg.a_dim,
        num_timestep_per_footstep=cfg.num_timestep_per_footstep,
        batch_size=cfg.batch_size,
        low_level_policy_type=cfg.low_level_policy_type,
        update_low_level_policy=cfg.update_low_level_policy,
        update_low_level_policy_lr=cfg.update_low_level_policy_lr,
        init_state=init_state,
    )

    if cfg.low_level_policy_type == 'NN':
        low_level_TG.load_model('.')

    # collect data
    collect_data(cfg, env, high_level_planning, low_level_TG,
                 HL_replay_buffer, com_utils)
    # train model
    train_model(cfg, HL_replay_buffer, high_level_planning)
def create_replay_buffer(args: argparse.Namespace, env: utils.FrameStack,
                         device: torch.device) -> utils.ReplayBuffer:
    """Method to create a replay buffer."""
    return utils.ReplayBuffer(
        obs_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
    )
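# A minimal call-site sketch for create_replay_buffer; base_env and args are
# placeholders assumed to be constructed elsewhere (args must provide
# replay_buffer_capacity and batch_size).
env = utils.FrameStack(base_env, k=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
replay_buffer = create_replay_buffer(args, env, device)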
def main(cfg):
    print(cfg.pretty())
    # define env & high level planning part & low level trajectory generator
    # & replay buffer for HLP
    # initialize logger
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = daisy_API(sim=cfg.sim, render=False, logger=False)
    env.set_control_mode(cfg.control_mode)
    state = env.reset()

    if cfg.sim:
        init_state = motion_library.exp_standing(env)

    model_obs_dim, model_output_dim = 2, 5

    HL_replay_buffer = utils.ReplayBuffer(
        model_obs_dim, cfg.z_dim, model_output_dim, device,
        cfg.num_iters * cfg.num_latent_action_per_iteration)

    high_level_planning = HLPM.high_level_planning(
        device=device,
        model_obs_dim=model_obs_dim,
        z_dim=3,
        model_output_dim=model_output_dim,
        model_hidden_num=cfg.model_hidden_num,
        model_layer_num=2,
        batch_size=cfg.batch_size,
        model_lr=cfg.model_lr,
        high_level_policy_type='raibert',
        update_sample_policy=cfg.update_sample_policy,
        update_sample_policy_lr=cfg.update_sample_policy_lr,
        low_level_policy_type=cfg.low_level_policy_type,
        num_timestep_per_footstep=50,
        model_update_steps=cfg.model_update_steps,
        control_frequency=cfg.control_frequency)

    low_level_TG = LLTG.low_level_TG(
        device=device,
        z_dim=3,
        a_dim=cfg.a_dim,
        num_timestep_per_footstep=50,
        batch_size=cfg.batch_size,
        low_level_policy_type='IK',
        update_low_level_policy=cfg.update_low_level_policy,
        update_low_level_policy_lr=cfg.update_low_level_policy_lr,
        init_state=init_state,
    )

    # if args.low_level_policy_type == 'NN':
    #     low_level_TG.load_model('./save_data/trial_2')

    # collect data
    collect_data(cfg, env, high_level_planning, low_level_TG,
                 HL_replay_buffer)
def train_BCQ(state_dim, action_dim, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"

    # Initialize policy
    if args.model == 'BCQ':
        policy = BCQ.BCQ(state_dim, action_dim, max_action, device,
                         args.discount, args.tau, args.lmbda, args.phi)
    elif args.model == 'BCQREM':
        policy = BCQREM.BCQ(state_dim, action_dim, max_action, device,
                            args.discount, args.tau, args.lmbda, args.phi)
    elif args.model == 'BCQREMshareQ':
        policy = BCQREM_share_Qparam.BCQ(state_dim, action_dim, max_action,
                                         device, args.discount, args.tau,
                                         args.lmbda, args.phi)
    elif args.model == 'BCQREMshareVQ':
        policy = BCQREM_share_VQparam.BCQ(state_dim, action_dim, max_action,
                                          device, args.discount, args.tau,
                                          args.lmbda, args.phi)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}")

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    print("NO.1 evaluations...")
    evaluations.append(eval_policy(policy, args.env, args.seed))

    while training_iters < args.max_timesteps:
        vae_loss, actor_loss, critic_loss = policy.train(
            replay_buffer,
            iterations=int(args.eval_freq),
            batch_size=args.batch_size)
        # vae_loss, actor_loss, critic_loss = round(vae_loss.item(), 5), \
        #     round(actor_loss.item(), 5), round(critic_loss.item(), 5)
        # print(f'times:{training_iters}/[{args.max_timesteps}], VAE: {vae_loss}, '
        #       f'Actor: {actor_loss}, Critic: {critic_loss}')
        evaluations.append(eval_policy(policy, args.env, args.seed))

        if args.model == 'BCQ':
            np.save(f"./results/BCQ_{setting}", evaluations)
        elif args.model == 'BCQREM':
            np.save(f"./results/BCQREM_{setting}", evaluations)
        elif args.model == 'BCQREMshareQ':
            np.save(f"./results/BCQREMshareQ_{setting}", evaluations)
        elif args.model == 'BCQREMshareVQ':
            np.save(f"./results/BCQREMshareVQ_{setting}", evaluations)

        training_iters += args.eval_freq
        print(f"Training iterations: {training_iters}")
def train_BEAR(state_dim, action_dim, max_action, device, args):
    print("Training BEAR\n")
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"
    hp_setting = (f"N{args.load_buffer_size}_phi{args.phi}_n{args.n_action}"
                  f"_ne{args.n_action_execute}_{args.score_activation}"
                  f"_k{str(args.sigmoid_k)}_betac{str(args.beta_c)}"
                  f"_betaa{str(args.beta_a)}")

    # Initialize policy
    policy = BEAR.BEAR(
        2, state_dim, action_dim, max_action,
        delta_conf=0.1,
        use_bootstrap=False,
        version=args.version,
        lambda_=0.0,
        threshold=0.05,
        mode=args.mode,
        num_samples_match=args.num_samples_match,
        mmd_sigma=args.mmd_sigma,
        lagrange_thresh=args.lagrange_thresh,
        use_kl=(True if args.distance_type == "KL" else False),
        use_ensemble=(False if args.use_ensemble_variance == "False" else True),
        kernel_type=args.kernel_type,
        actor_lr=args.actor_lr)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/Extended-{buffer_name}",
                       args.load_buffer_size,
                       bootstrap_dim=4)

    if args.actor_lr != 1e-3:
        hp_setting += f"_lr{args.actor_lr}"

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    while training_iters < args.max_timesteps:
        pol_vals = policy.train(replay_buffer,
                                iterations=int(args.eval_freq),
                                batch_size=args.batch_size)
        evaluations.append(eval_policy(policy, args.env, args.seed))
        np.save(f"./results/BEAR3_{hp_setting}_{buffer_name}", evaluations)
        training_iters += args.eval_freq
        print(f"Training iterations: {training_iters}")
def load_checkpoint(checkpoint_path, rb_path, policy, args): fpath = os.path.join(checkpoint_path, "model.pyth") checkpoint = torch.load(fpath, map_location="cpu") # change to default graph before loading policy.change_morphology([-1]) # load and return checkpoint policy.actor.load_state_dict(checkpoint["actor_state"]) policy.critic.load_state_dict(checkpoint["critic_state"]) policy.actor_target.load_state_dict(checkpoint["actor_target_state"]) policy.critic_target.load_state_dict(checkpoint["critic_target_state"]) policy.actor_optimizer.load_state_dict(checkpoint["actor_optimizer_state"]) policy.critic_optimizer.load_state_dict( checkpoint["critic_optimizer_state"]) # load replay buffer all_rb_files = [f[:-4] for f in os.listdir(rb_path) if ".npy" in f] all_rb_files.sort() replay_buffer_new = dict() for name in all_rb_files: if len(all_rb_files) > args.rb_max // 1e6: replay_buffer_new[name] = utils.ReplayBuffer( max_size=args.rb_max // len(all_rb_files)) else: replay_buffer_new[name] = utils.ReplayBuffer() replay_buffer_new[name].max_size = int(checkpoint["rb_max"][name]) replay_buffer_new[name].ptr = int(checkpoint["rb_ptr"][name]) replay_buffer_new[name].slicing_size = checkpoint["rb_slicing_size"][ name] replay_buffer_new[name].storage = list( np.load(os.path.join(rb_path, "{}.npy".format(name)))) return ( checkpoint["total_timesteps"], checkpoint["episode_num"], replay_buffer_new, checkpoint["num_samples"], fpath, )
def __init__(self, state_dim, action_dim, action_bound, discount_factor=1,
             seed=1, actor_lr=1e-3, critic_lr=1e-3, batch_size=100,
             namescope='default', tau=0.005, policy_noise=0.1,
             noise_clip=0.5, hidden_size=300):
    np.random.seed(int(seed))
    tf.set_random_seed(seed)
    self.state_dim = state_dim
    self.action_dim = action_dim
    # env.seed(int(seed))
    self.policy_noise = policy_noise
    self.noise_clip = noise_clip
    self.discount_factor = discount_factor
    self.batch_size = batch_size
    self.sess = tf.Session()
    self.hidden_size = hidden_size
    self.actor = Actor(self.sess, state_dim, action_dim, action_bound,
                       actor_lr, tau, int(batch_size), self.hidden_size,
                       namescope=namescope + str(seed))
    self.critic = Critic(self.sess, state_dim, action_dim, critic_lr, tau,
                         self.actor.scaled_out, self.hidden_size,
                         namescope=namescope + str(seed))
    actor_loss = -tf.reduce_mean(self.critic.total_out)
    self.actor_train_step = tf.train.AdamOptimizer(actor_lr).minimize(
        actor_loss, var_list=self.actor.network_params)
    self.action_bound = action_bound
    self.sess.run(tf.global_variables_initializer())
    self.replay_buffer = utils.ReplayBuffer()
def __init__(self, env_name='Hopper-v2', total_episodes=1000, action_bound=1,
             episode_length=1000, learning_rate=0.02, weight=0.01,
             learning_steps=100, num_samples=8, noise=0.02, bc_index=[],
             std_dev=0.03, syn_step=10, num_best=4, meta_population_size=5,
             seed=1, hidden_size=300, coefficient=1):
    self.env = gym.make(env_name)
    np.random.seed(seed)
    self.env.seed(seed)
    self.action_bound = action_bound
    self.input_size = self.env.observation_space.shape[0]
    self.output_size = self.env.action_space.shape[0]
    self.total_episodes = total_episodes
    self.episode_length = episode_length
    self.lr = learning_rate
    self.num_best = num_best
    self.num_samples = num_samples
    self.noise = noise
    self.meta_population_size = meta_population_size
    self.seed = seed
    self.syn_step = syn_step
    self.coefficient = coefficient
    self.learning_steps = learning_steps
    self.bc_index = bc_index
    self.weight = weight
    self.normalizer = utils.Normalizer(self.env.observation_space.shape[0])
    self.hidden_size = hidden_size
    self.stddev = std_dev
    self.intrinsic_network = IntrinsicNetwork(state_dim=self.input_size,
                                              action_dim=self.output_size,
                                              seed=self.seed,
                                              namescope=str(seed),
                                              weight=self.weight)
    self.replay = utils.ReplayBuffer()
def __init__(self, config):
    self.config = config
    buffer_config = {'seed': config['seed'], 'size': config['buffer_size']}
    self.buffer = utils.ReplayBuffer(config=buffer_config)
    self.rng = np.random.default_rng(config['seed'])
    net_config = {
        'seed': config['seed'],
        'depth': config['network_depth'],
        'width': config['network_width'],
        'num_actions': config['num_actions'],
        'input_dim': config['input_dim']
    }
    self.qnet = QNet(net_config)
    self.targetnet = QNet(net_config)
    self.optimizer = torch.optim.Adam(self.qnet.parameters(),
                                      lr=config['step_size'])
    self.loss = torch.nn.MSELoss()
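# qnet and targetnet above start from the same seeded net_config, so they
# begin identical; during training the target is typically refreshed by a
# hard copy. A sketch of that sync (the method name and update cadence are
# assumptions, not from the snippet):
def sync_target(self):
    # Hard update: copy online Q-network weights into the target network.
    self.targetnet.load_state_dict(self.qnet.state_dict())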
def moduleShow(args):
    env = gym.make(args.env_name)
    state_dim = env.observation_space["observation"].shape[0] + \
        env.observation_space["desired_goal"].shape[0]
    # state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]

    obs = env.reset()
def train_BCQ(state_dim, action_dim, max_action, device, args): # For saving files setting = f"{args.env}_{args.seed}" buffer_name = f"{args.buffer_name}_{setting}" # Initialize policy policy = BCQ_brain.BCQ(state_dim, action_dim, max_action, device, args.discount, args.tau, args.lmbda, args.phi) # Load buffer replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device) replay_buffer.load(f"./buffers/{buffer_name}") training_iters = 0 while training_iters < args.max_timesteps: pol_vals = policy.train(replay_buffer, iterations=int(args.eval_freq), batch_size=args.batch_size) training_iters += args.eval_freq #eval_policy(policy, training_iters) print(f"Training iterations: {training_iters}") return policy.actor_loss, policy.critic_loss, policy.vae_loss
def test_vae_state(state_dim, action_dim, max_state, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"
    hp_setting = (f"{args.score_activation}_k{str(args.sigmoid_k)}"
                  f"_betac{str(args.beta_c)}_betaa{str(args.beta_a)}")

    # Initialize policy
    policy = BCQ.BCQ_state(state_dim, action_dim, max_state, max_action,
                           device, args.discount, args.tau, args.lmbda,
                           args.phi,
                           beta_a=args.beta_a,
                           beta_c=args.beta_c,
                           sigmoid_k=args.sigmoid_k)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}", args.load_buffer_size)

    training_iters = 0
    while training_iters < int(args.max_timesteps / 5):
        vae_loss = policy.train_vae(replay_buffer,
                                    iterations=int(args.eval_freq),
                                    batch_size=args.batch_size)
        print(f"Training iterations: {training_iters}")
        print("VAE loss", vae_loss)
        training_iters += args.eval_freq

    policy.vae2.save(f"./models/vae_{setting}")
    test_loss = policy.test_vae(replay_buffer, batch_size=100000)
    print(test_loss)
    np.save(f"./results/vae_pretrain/elbo_{args.seed}", test_loss)
def train_DBCQ(dargs, device):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists("./models"):
        os.makedirs("./models")

    # For saving files
    setting = f"{dargs.env}_{dargs.seed}"
    buffer_name = f"{dargs.buffer_name}_{setting}"

    # Initialize policy
    policy = DBCQ.DBCQ(dargs.parameters, dargs.env_properties, device)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(dargs.env_properties["state_dim"],
                                       dargs.env_properties["num_actions"],
                                       device)
    replay_buffer.load(f"./buffers/{buffer_name}")

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    while training_iters < dargs.max_timesteps:
        pol_vals = policy.train(replay_buffer)
        evaluations.append(eval_policy(policy, dargs.env, dargs.seed))
        np.save(f"./results/BCQ_{setting}", evaluations)
        training_iters += dargs.eval_freq
        print(f"Training iterations: {training_iters}")

    return policy
kwargs["policy_freq"] = args.policy_freq policy = TD3.TD3(**kwargs) if args.policy == "DDPG": policy = DDPG.DDPG(**kwargs) if args.policy == "newDDPG": policy = newDDPG.DDPG(**kwargs) if args.policy == "newTD3": policy = newTD3.TD3(**kwargs) if args.policy == "A2C": policy = A2C.A2C(**kwargs) if args.load_model != "": policy_file = file_name if args.load_model == "default" else args.load_model policy.load(f"./models/{policy_file}") replay_buffer = utils.ReplayBuffer(state_dim, action_dim=action_dim) # Initialize environment minerEnv = MinerEnv(HOST, PORT) minerEnv.start() #init environment # Evaluate untrained policy evaluations = [eval_policy(policy, minerEnv)] train = False for episode_i in range(0, N_EPISODE): # Reset environment mapID = request_to_env(minerEnv, train) # init environment game minerEnv.reset() #action = policy.select_action(np.array(state))
def evaluate(env, agent, args, video, adapt=False):
    """Evaluate an agent, optionally adapting with PAD."""
    episode_rewards = []
    for i in tqdm(range(args.pad_num_episodes)):
        ep_agent = deepcopy(agent)  # make a new copy
        if args.use_curl:  # initialize replay buffer for CURL
            replay_buffer = utils.ReplayBuffer(
                obs_shape=env.observation_space.shape,
                action_shape=env.action_space.shape,
                capacity=args.train_steps,
                batch_size=args.pad_batch_size)
        video.init(enabled=True)

        obs = env.reset()
        done = False
        episode_reward = 0
        losses = []
        step = 0
        ep_agent.train()

        while not done:
            # Take step
            with utils.eval_mode(ep_agent):
                action = ep_agent.select_action(obs)
            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            # Make self-supervised update if flag is true
            if adapt:
                if args.use_rot:  # rotation prediction
                    # Prepare batch of cropped observations
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.random_crop(batch_next_obs)
                    # Adapt using rotation prediction
                    losses.append(ep_agent.update_rot(batch_next_obs))

                if args.use_inv:  # inverse dynamics model
                    # Prepare batch of observations
                    batch_obs = utils.batch_from_obs(
                        torch.Tensor(obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_action = torch.Tensor(action).cuda().unsqueeze(
                        0).repeat(args.pad_batch_size, 1)
                    # Adapt using inverse dynamics prediction
                    losses.append(
                        ep_agent.update_inv(utils.random_crop(batch_obs),
                                            utils.random_crop(batch_next_obs),
                                            batch_action))

                if args.use_curl:  # CURL
                    # Add observation to replay buffer for use as negative samples
                    # (only first argument obs is used, but we store all for convenience)
                    replay_buffer.add(obs, action, reward, next_obs, True)
                    # Prepare positive and negative samples
                    obs_anchor, obs_pos = get_curl_pos_neg(next_obs,
                                                           replay_buffer)
                    # Adapt using CURL
                    losses.append(
                        ep_agent.update_curl(obs_anchor, obs_pos, ema=True))

            video.record(env, losses)
            obs = next_obs
            step += 1

        video.save(
            f'{args.mode}_pad_{i}.mp4' if adapt else f'{args.mode}_{i}.mp4')
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action) elif args.policy_name == "BNNTD3": policy = BNNTD3.TD3(state_dim, action_dim, max_action) elif args.policy_name == "BootstrapTD3": if args.actor_branches > 0: actor_branches = args.actor_branches else: actor_branches = args.branches policy = BootstrapTD3.TD3(state_dim, action_dim, max_action, args.branches, actor_branches) elif args.policy_name == "OurDDPG": policy = OurDDPG.DDPG(state_dim, action_dim, max_action) elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action) replay_buffer = utils.ReplayBuffer() # Evaluate untrained policy evaluations = [evaluate_policy(policy)] total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 if args.actor_branches > 0: branches = args.actor_branches else: branches = args.branches branch = sample_branch(branches) done = True while total_timesteps < args.max_timesteps:
policy = TD3.TD3(**kwargs)
if args.policy == "DDPG":
    policy = DDPG.DDPG(**kwargs)
if args.policy == "newDDPG":
    policy = newDDPG.DDPG(**kwargs)
if args.policy == "TD3_conv":
    policy = TD3_conv.TD3(**kwargs)
if args.policy == "A2C":
    policy = A2C.A2C(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./models/{policy_file}")

replay_buffer = utils.ReplayBuffer(state_dim,
                                   action_dim=action_dim,
                                   max_size=int(10000))

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # init environment

# Evaluate untrained policy
# evaluations = [eval_policy(policy, minerEnv)]

train = False
best_score = {1: 0, 2: 0, 3: 0, 4: 0}
for episode_i in range(0, N_EPISODE):
    # Reset environment
    mapID = request_to_env(minerEnv, train)  # init environment game
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size \
        if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre-transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack,
                             pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)
        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
def train(sess, env, args, actor, critic, action_bound):
    # update actor network by the deterministic policy gradient
    actor_loss = -tf.reduce_mean(critic.total_out_scaled)
    actor_train_step = tf.train.AdamOptimizer(args['actor_lr']).minimize(
        actor_loss, var_list=actor.network_params)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)
    saver = tf.train.Saver()

    if tf.train.checkpoint_exists(os.path.join(args['save_dir'], args['env'])):
        saver.restore(sess, os.path.join(args['save_dir'], args['env']))
        print("Loading pre-trained model...")

    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    episode_num = 0
    done = True
    episode_reward = 0
    episode_timesteps = 0

    while total_timesteps < args['max_timesteps']:
        # save the trained model periodically, i.e., every save_timesteps
        if total_timesteps != 0 and total_timesteps % args['save_timesteps'] == 0:
            print('start saving ...')
            saver.save(sess, os.path.join(args['save_dir'], args['env']))

        if done:
            # train
            if total_timesteps != 0:
                print("total - ", total_timesteps, "episode num ",
                      episode_num, "episode reward ", episode_reward)
                training(sess, actor, critic, actor_train_step, action_bound,
                         replay_buffer, args, episode_timesteps)

            # evaluate
            if total_timesteps != 0 and episode_num % args['eval_episodes'] == 0:
                print('start evaluating ...')
                eval(env, actor)

            # book-keeping
            summary_str = sess.run(summary_ops,
                                   feed_dict={
                                       summary_vars[0]: episode_reward,
                                       summary_vars[1]: episode_timesteps
                                   })
            writer.add_summary(summary_str, total_timesteps)
            writer.flush()

            s = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # sample action
        if total_timesteps < args['start_timesteps']:
            action = env.action_space.sample()
        else:
            action = actor.predict(np.reshape(s, (1, actor.s_dim)))
            action = (action + np.random.normal(
                0, args['expl_noise'], size=action.shape)).clip(
                    env.action_space.low, env.action_space.high)
            action = np.reshape(action, [-1])

        s2, r, done, info = env.step(action)
        episode_reward += r
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        replay_buffer.add((s, s2, action, r, done_bool))

        s = s2
        episode_timesteps += 1
        total_timesteps += 1
def main(args): device = "cuda" if torch.cuda.is_available() else "cpu" args.work_dir = os.path.join( args.work_dir, args.domain_name + "_" + args.task_name, args.exp_name, str(args.seed), ) os.makedirs(args.work_dir, exist_ok=True) with open(os.path.join(args.work_dir, "args.json"), "w") as f: json.dump(vars(args), f, sort_keys=True, indent=4) train_envs = [ utils.make_env(np.random.randint(0, 255), args) for i in range(args.num_envs) ] eval_envs = [ utils.make_env(np.random.randint(0, 255), args) for i in range(5) ] print("Train env backgrounds: ", [train_env.bg_color for train_env in train_envs]) print("Eval env backgrounds: ", [eval_env.bg_color for eval_env in eval_envs]) obs_shape = train_envs[0].observation_space.shape action_size = train_envs[0].action_space.shape[0] phi = Encoder(obs_shape, args.encoder_feature_dim).to(device) model = DynamicsModel(args.encoder_feature_dim, action_size).to(device) decoders = [ Decoder(obs_shape, args.encoder_feature_dim).to(device) for i in range(args.num_envs) ] opt = torch.optim.Adam(list(phi.parameters()) + list(model.parameters()), lr=args.lr) decoder_opt = torch.optim.Adam(np.concatenate( [list(decoder.parameters()) for decoder in decoders]), lr=args.lr) train_replay_buffer = utils.ReplayBuffer( obs_shape=train_envs[0].observation_space.shape, action_shape=train_envs[0].action_space.shape, capacity=args.replay_buffer_capacity, batch_size=args.batch_size, device=device, ) eval_replay_buffer = utils.ReplayBuffer( obs_shape=train_envs[0].observation_space.shape, action_shape=train_envs[0].action_space.shape, capacity=args.replay_buffer_capacity, batch_size=args.batch_size, device=device, ) logging_dict = { "model_error": [], "decoding_error": [], "eval_model_error": [], "steps": [], } # collect data across environments for env_id in range(args.num_envs): train_replay_buffer = utils.collect_random_data( train_envs[env_id], env_id, args.num_samples, train_replay_buffer, save_video=args.save_video, ) eval_replay_buffer = utils.collect_random_data(eval_envs[env_id], env_id, args.num_samples, eval_replay_buffer) # Train loop for iteration in range(args.num_iters): model_error = 0 decoder_error = 0 for i in range(args.num_envs): obses, actions, rewards, next_obses, not_dones = train_replay_buffer.sample( i) latent = phi(obses) pred_next_latent = model(latent, actions) true_next_latent = phi(next_obses).detach() error_e = F.mse_loss(pred_next_latent, true_next_latent) model_error += error_e if args.one_decoder: pred_next_obses = decoders[0]( pred_next_latent) # only use one decoder else: pred_next_obses = decoders[i](pred_next_latent) decoder_error_e = F.mse_loss(pred_next_obses, next_obses) decoder_error += decoder_error_e opt.zero_grad() model_error.backward(retain_graph=True) opt.step() decoder_opt.zero_grad() decoder_error.backward() decoder_opt.step() if iteration % args.log_interval == 0: with torch.no_grad(): logging_dict["steps"].append(iteration) logging_dict["model_error"].append(model_error.item()) logging_dict["decoding_error"].append(decoder_error.item()) print( f"Iteration {iteration}: Mean train set model error: {model_error.mean()}, decoding error: {decoder_error.mean()}%%" ) # Evaluate on test environment ( obses, actions, rewards, next_obses, not_dones, ) = eval_replay_buffer.sample() with torch.no_grad(): latent = phi(obses) pred_next_latent = model(latent, actions) true_next_latent = phi(next_obses).detach() test_error = F.mse_loss(pred_next_latent, true_next_latent) logging_dict["eval_model_error"].append(test_error.item()) 
print(f"Mean test set error: {test_error}") torch.save(logging_dict, os.path.join(args.work_dir, "logging_dict.pt"))
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        os.makedirs(os.path.join(args.log_dir, args.log_file))

    env = gym.make(args.env_name)
    obs = env.reset()
    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.log_file))
    # torch.cuda.set_device(0)

    env_name = type(env).__name__
    file_name = 'hiro_{}'.format(env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = int(env.action_space.high[0])

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(state_dim=state_dim,
                                        goal_dim=state_dim,
                                        action_dim=action_dim,
                                        max_action=max_action,
                                        actor_lr=args.ctrl_act_lr,
                                        critic_lr=args.ctrl_crit_lr,
                                        ctrl_rew_type=args.ctrl_rew_type)
    manager_policy = hiro.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=state_dim,
                                  actor_lr=args.man_act_lr,
                                  critic_lr=args.man_crit_lr,
                                  candidate_goals=args.candidate_goals)
    calculate_controller_reward = hiro_controller_reward

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)
    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    # Logging Parameters
    total_timesteps = 0
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps,
                    args.ctrl_batch_size, args.discount, args.ctrl_tau)

                writer.add_scalar('data/controller_actor_loss',
                                  ctrl_act_loss, total_timesteps)
                writer.add_scalar('data/controller_critic_loss',
                                  ctrl_crit_loss, total_timesteps)
                writer.add_scalar('data/controller_ep_rew', episode_reward,
                                  total_timesteps)
                writer.add_scalar('data/manager_ep_rew',
                                  manager_transition[4], total_timesteps)

                # Train Manager
                if timesteps_since_manager >= args.train_manager_freq:
                    print('Training Manager...')
                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        args.man_batch_size, args.discount, args.man_tau)
                    writer.add_scalar('data/manager_actor_loss',
                                      man_act_loss, total_timesteps)
                    writer.add_scalar('data/manager_critic_loss',
                                      man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = \
                        evaluate_policy(
                            env, writer,
                            manager_policy, controller_policy,
                            calculate_controller_reward,
                            args.ctrl_rew_scale,
                            args.manager_propose_freq, len(evaluations))

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew,
                                      total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew',
                                      avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps,
                                      total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved',
                                      avg_env_finish, total_timesteps)

                    evaluations.append(
                        [avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller',
                                               directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager',
                                            directory="./pytorch_models")

                    np.save("./results/%s" % (file_name), evaluations)

                # Process final state/obs, store manager transition,
                # if it was not just created
                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)

                    # Every manager transition should have the same sequence length
                    if len(manager_transition[-2]) <= args.manager_propose_freq:
                        while len(manager_transition[-2]) <= args.manager_propose_freq:
                            manager_transition[-1].append(np.inf)
                            manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']
            """
            obs = env.reset()
                => {"observation", "achieved_goal", "desired_goal"}
                   (10,)           (3,)             (3,)
            goal = obs['desired_goal']   => (3,)
            state = obs['observation']   => (10,)
            """
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

            # Create new manager transition
            subgoal = manager_policy.sample_goal(state, goal)
            timesteps_since_subgoal = 0
            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        # Perform action, get (next_state, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process next observation
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        controller_reward = calculate_controller_reward(
            state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal,
                                                       next_state)

        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        controller_buffer.add(
            (state, next_state, subgoal,
             action, controller_reward, float(done),
             [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(True)
            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=np.inf)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0
            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

    # Final evaluation
    evaluations.append([
        evaluate_policy(env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))
    ])

    if args.save_models:
        controller_policy.save(file_name + '_controller',
                               directory="./pytorch_models")
        manager_policy.save(file_name + '_manager',
                            directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)