def __init__(self, index, variant, candidate_size=10):
    ptu.set_gpu_mode(True)
    torch.set_num_threads(1)

    import sys
    sys.argv = ['']
    del sys

    env_max_action = variant['env_max_action']
    obs_dim = variant['obs_dim']
    action_dim = variant['action_dim']
    latent_dim = variant['latent_dim']
    vae_latent_dim = 2 * action_dim

    self.f = MlpEncoder(
        g_hidden_sizes=variant['g_hidden_sizes'],
        g_input_sizes=obs_dim + action_dim + 1,
        g_latent_dim=variant['g_latent_dim'],
        h_hidden_sizes=variant['h_hidden_sizes'],
        latent_dim=latent_dim,
    )
    self.Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    self.vae_decoder = VaeDecoder(
        max_action=variant['env_max_action'],
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + latent_dim,
        output_size=action_dim,
    )
    self.perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=action_dim,
    )

    self.env = env_producer(variant['domain'], variant['seed'])
    self.num_evals = variant['algo_params']['num_evals']
    self.max_path_length = variant['max_path_length']
    self.vae_latent_dim = vae_latent_dim
    self.num_trans_context = variant['num_trans_context']
    self.candidate_size = variant['candidate_size']
    self.seed = variant['seed']
    self.index = index

    self.env.seed(10 * self.seed + 1234 + index)
    set_seed(10 * self.seed + 1234 + index)
def experiment(variant, bcq_buffers, prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(
        bcq_buffers_list=bcq_buffers,
    )

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], variant['seed'])
    env.reset()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # instantiate networks
    network_ensemble = []
    for _ in range(variant['num_network_ensemble']):
        P = FlattenMlp(
            hidden_sizes=variant['P_hidden_sizes'],
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        network_ensemble.append(P)

    trainer = SuperQTrainer(
        env,
        network_ensemble=network_ensemble,
        train_goal=variant['train_goal'],
        std_threshold=variant['std_threshold'],
        domain=variant['domain'],
    )

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        train_buffer,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    algorithm.train(start_epoch)
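# Illustrative sketch (assumption, not from the original repo): how an ensemble
# of reward predictors like `network_ensemble` above can be queried to get a
# per-input mean and standard deviation, which a threshold such as
# `std_threshold` could then gate. The `EnsembleScorer` name and the plain
# torch.nn MLPs are hypothetical stand-ins for FlattenMlp.
import torch
import torch.nn as nn


class EnsembleScorer:
    def __init__(self, networks):
        self.networks = networks

    @torch.no_grad()
    def mean_std(self, obs, action):
        inputs = torch.cat([obs, action], dim=-1)
        # Stack each member's prediction: (ensemble_size, batch, 1)
        preds = torch.stack([net(inputs) for net in self.networks], dim=0)
        return preds.mean(dim=0), preds.std(dim=0)


if __name__ == '__main__':
    obs_dim, action_dim = 17, 6
    nets = [nn.Sequential(nn.Linear(obs_dim + action_dim, 64), nn.ReLU(),
                          nn.Linear(64, 1)) for _ in range(5)]
    scorer = EnsembleScorer(nets)
    mean, std = scorer.mean_std(torch.randn(8, obs_dim),
                                torch.randn(8, action_dim))
    keep = std < 0.1  # e.g. trust predictions whose ensemble std is small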
def q_producer():
    return FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
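# Illustrative sketch (assumption, not from the source): `q_producer` is a
# closure that captures `obs_dim`, `action_dim`, and `hidden_sizes` so callers
# can build as many identically shaped Q-networks as they need (e.g. twin
# critics plus their targets). The `make_q_producer` helper and the plain
# torch.nn MLP below are hypothetical stand-ins for the repo's FlattenMlp.
import torch.nn as nn


def make_q_producer(obs_dim, action_dim, hidden_sizes):
    def q_producer():
        layers, in_size = [], obs_dim + action_dim
        for h in hidden_sizes:
            layers += [nn.Linear(in_size, h), nn.ReLU()]
            in_size = h
        layers.append(nn.Linear(in_size, 1))
        return nn.Sequential(*layers)
    return q_producer


# Usage: each call returns a freshly initialized network of the same shape.
# producer = make_q_producer(obs_dim=17, action_dim=6, hidden_sizes=[256, 256])
# qf1, qf2 = producer(), producer()
# target_qf1, target_qf2 = producer(), producer()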
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)
    env_max_action = float(expl_env.action_space.high[0])
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = 2 * obs_dim + action_dim + 1

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Network module from tiMe
    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])
    context_encoder = ProbabilisticContextEncoder(mlp_enconder,
                                                  variant['latent_dim'])
    qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )

    # Load the params obtained by tiMe
    ss = load_gzip_pickle(variant['path_to_snapshot'])
    ss = ss['trainer']

    encoder_state_dict = OrderedDict()
    for key, value in ss['context_encoder_state_dict'].items():
        if 'mlp_encoder' in key:
            encoder_state_dict[key.replace('mlp_encoder.', '')] = value
    mlp_enconder.load_state_dict(encoder_state_dict)

    qf1.load_state_dict(ss['Qs_state_dict'])
    target_qf1.load_state_dict(ss['Qs_state_dict'])
    qf2.load_state_dict(ss['Qs_state_dict'])
    target_qf2.load_state_dict(ss['Qs_state_dict'])
    vae_decoder.load_state_dict(ss['vae_decoder_state_dict'])
    perturbation_generator.load_state_dict(ss['perturbation_generator_dict'])

    tiMe_path_collector = tiMeSampler(
        expl_env,
        context_encoder,
        qf1,
        vae_decoder,
        perturbation_generator,
        vae_latent_dim=vae_latent_dim,
        candidate_size=variant['candidate_size'],
    )
    tiMe_path_collector.to(ptu.device)

    # Get producer function for policy
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=variant['policy_hidden_sizes'])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)

    expl_path_collector = MdpPathCollector(expl_env, )

    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)

    trainer = SACTrainer(policy_producer,
                         qf1=qf1,
                         target_qf1=target_qf1,
                         qf2=qf2,
                         target_qf2=target_qf2,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        tiMe_data_collector=tiMe_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    algorithm.train(start_epoch)
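# Illustrative sketch (assumption): the key-renaming loop above can be written
# as a small helper that strips a module prefix (here 'mlp_encoder.') from a
# saved state dict before load_state_dict, which is handy whenever a submodule
# was checkpointed as part of a larger wrapper. `strip_prefix` is hypothetical.
from collections import OrderedDict


def strip_prefix(state_dict, prefix):
    out = OrderedDict()
    for key, value in state_dict.items():
        if key.startswith(prefix):
            out[key[len(prefix):]] = value
    return out


# Usage (mirrors the loop above):
# mlp_enconder.load_state_dict(
#     strip_prefix(ss['context_encoder_state_dict'], 'mlp_encoder.'))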
class RemotePathCollectorSingleMdp(object):
    def __init__(self, index, variant, candidate_size=10):
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

        import sys
        sys.argv = ['']
        del sys

        env_max_action = variant['env_max_action']
        obs_dim = variant['obs_dim']
        action_dim = variant['action_dim']
        latent_dim = variant['latent_dim']
        vae_latent_dim = 2 * action_dim
        mlp_enconder_input_size = 2 * obs_dim + action_dim + 1 if variant[
            'use_next_obs_in_context'] else obs_dim + action_dim + 1

        mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                                  input_size=mlp_enconder_input_size,
                                  output_size=2 * variant['latent_dim'])
        self.context_encoder = ProbabilisticContextEncoder(
            mlp_enconder, variant['latent_dim'])
        self.Qs = FlattenMlp(
            hidden_sizes=variant['Qs_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )
        self.vae_decoder = VaeDecoder(
            max_action=variant['env_max_action'],
            hidden_sizes=variant['vae_hidden_sizes'],
            input_size=obs_dim + vae_latent_dim + latent_dim,
            output_size=action_dim,
        )
        self.perturbation_generator = PerturbationGenerator(
            max_action=env_max_action,
            hidden_sizes=variant['perturbation_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=action_dim,
        )

        self.use_next_obs_in_context = variant['use_next_obs_in_context']
        self.env = env_producer(variant['domain'], variant['seed'])
        self.num_evals = variant['num_evals']
        self.max_path_length = variant['max_path_length']
        self.vae_latent_dim = vae_latent_dim
        self.candidate_size = variant['candidate_size']

        self.env.seed(10 * variant['seed'] + 1234 + index)
        set_seed(10 * variant['seed'] + 1234 + index)
        self.env.action_space.np_random.seed(123 + index)

    def async_evaluate(self, goal):
        self.env.set_goal(goal)
        self.context_encoder.clear_z()

        avg_reward = 0.
        avg_achieved = []
        final_achieved = []

        raw_context = deque()
        for i in range(self.num_evals):
            # Sample MDP identity
            self.context_encoder.sample_z()
            inferred_mdp = self.context_encoder.z

            obs = self.env.reset()
            done = False
            path_length = 0
            while not done and path_length < self.max_path_length:
                action = self.select_actions(np.array(obs), inferred_mdp)
                next_obs, reward, done, env_info = self.env.step(action)
                avg_achieved.append(env_info['achieved'])
                if self.use_next_obs_in_context:
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        next_obs.reshape(1, -1),
                        np.array(reward).reshape(1, -1)
                    ], axis=1)
                else:
                    assert False
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        np.array(reward).reshape(1, -1)
                    ], axis=1)
                raw_context.append(new_context)
                obs = next_obs.copy()
                if i > 1:
                    avg_reward += reward
                path_length += 1

            context = from_numpy(np.concatenate(raw_context, axis=0))[None]
            self.context_encoder.infer_posterior(context)
            if i > 1:
                final_achieved.append(env_info['achieved'])

        avg_reward /= (self.num_evals - 2)

        if np.isscalar(env_info['achieved']):
            avg_achieved = np.mean(avg_achieved)
            final_achieved = np.mean(final_achieved)
        else:
            avg_achieved = np.stack(avg_achieved)
            avg_achieved = np.mean(avg_achieved, axis=0)

            final_achieved = np.stack(final_achieved)
            final_achieved = np.mean(final_achieved, axis=0)

        print(avg_reward)
        return avg_reward, (final_achieved.tolist(), self.env._goal.tolist())

    def async_evaluate_test(self, goal):
        self.env.set_goal(goal)
        self.context_encoder.clear_z()

        avg_reward_list = []
        online_achieved_list = []

        raw_context = deque()
        for _ in range(self.num_evals):
            # Sample MDP identity
            self.context_encoder.sample_z()
            inferred_mdp = self.context_encoder.z

            obs = self.env.reset()
            done = False
            path_length = 0
            avg_reward = 0.
            online_achieved = []
            while not done and path_length < self.max_path_length:
                action = self.select_actions(np.array(obs), inferred_mdp)
                next_obs, reward, done, env_info = self.env.step(action)
                achieved = env_info['achieved']
                online_achieved.append(np.arctan(achieved[1] / achieved[0]))
                if self.use_next_obs_in_context:
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        next_obs.reshape(1, -1),
                        np.array(reward).reshape(1, -1)
                    ], axis=1)
                else:
                    new_context = np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        np.array(reward).reshape(1, -1)
                    ], axis=1)
                raw_context.append(new_context)
                obs = next_obs.copy()
                avg_reward += reward
                path_length += 1

            avg_reward_list.append(avg_reward)
            online_achieved = np.array(online_achieved)
            online_achieved_list.append([
                online_achieved.mean(),
                online_achieved.std(), self.env._goal
            ])

            context = from_numpy(np.concatenate(raw_context, axis=0))[None]
            self.context_encoder.infer_posterior(context)

        return online_achieved_list

    def set_network_params(self, params_list):
        '''
        The shipped params are on the CPU here. This function sets the params
        of the sampler's networks from params_list and ships them to the GPU.
        '''
        context_encoder_params, Qs_params, vae_params, perturbation_params = params_list

        self.context_encoder.mlp_encoder.set_param_values(
            context_encoder_params)
        self.context_encoder.mlp_encoder.to(ptu.device)

        self.Qs.set_param_values(Qs_params)
        self.Qs.to(ptu.device)

        self.vae_decoder.set_param_values(vae_params)
        self.vae_decoder.to(ptu.device)

        self.perturbation_generator.set_param_values(perturbation_params)
        self.perturbation_generator.to(ptu.device)

    def select_actions(self, obs, inferred_mdp):
        # Repeat the obs as BCQ does; candidate_size indicates how many
        # candidate actions we need.
        obs = from_numpy(np.tile(obs.reshape(1, -1),
                                 (self.candidate_size, 1)))
        with torch.no_grad():
            inferred_mdp = inferred_mdp.repeat(self.candidate_size, 1)
            z = from_numpy(
                np.random.normal(0, 1, size=(obs.size(0),
                                             self.vae_latent_dim))).clamp(
                                                 -0.5, 0.5).to(ptu.device)
            candidate_actions = self.vae_decoder(obs, z, inferred_mdp)
            perturbed_actions = self.perturbation_generator.get_perturbed_actions(
                obs, candidate_actions, inferred_mdp)
            qv = self.Qs(obs, perturbed_actions, inferred_mdp)
            ind = qv.max(0)[1]
        return ptu.get_numpy(perturbed_actions[ind])
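# Illustrative sketch (assumption, not from the source): the core of
# `select_actions` above is "tile the observation, generate candidate actions,
# score them with Q, return the argmax". The tiny linear modules below are
# hypothetical stand-ins for the VAE decoder, perturbation generator, and Q
# network, just to make the tensor shapes concrete.
import torch
import torch.nn as nn

obs_dim, action_dim, latent_dim, vae_latent_dim, candidate_size = 17, 6, 8, 12, 10
decoder = nn.Linear(obs_dim + vae_latent_dim + latent_dim, action_dim)
perturber = nn.Linear(obs_dim + action_dim + latent_dim, action_dim)
q_net = nn.Linear(obs_dim + action_dim + latent_dim, 1)

obs = torch.randn(1, obs_dim).repeat(candidate_size, 1)             # (C, obs_dim)
inferred_mdp = torch.randn(1, latent_dim).repeat(candidate_size, 1)  # (C, latent_dim)
with torch.no_grad():
    z = torch.randn(candidate_size, vae_latent_dim).clamp(-0.5, 0.5)
    candidates = torch.tanh(decoder(torch.cat([obs, z, inferred_mdp], dim=1)))
    perturbed = candidates + 0.05 * torch.tanh(
        perturber(torch.cat([obs, candidates, inferred_mdp], dim=1)))
    q_values = q_net(torch.cat([obs, perturbed, inferred_mdp], dim=1))  # (C, 1)
    best = q_values.max(0)[1]  # index of the highest-valued candidate
action = perturbed[best].numpy()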
def experiment(variant,
               bcq_policies,
               bcq_buffers,
               ensemble_params_list,
               prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(
        bcq_buffers_list=bcq_buffers,
    )

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], variant['seed'])
    env_max_action = float(env.action_space.high[0])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = 2 * obs_dim + action_dim + 1 if variant[
        'use_next_obs_in_context'] else obs_dim + action_dim + 1

    variant['env_max_action'] = env_max_action
    variant['obs_dim'] = obs_dim
    variant['action_dim'] = action_dim
    variant['mlp_enconder_input_size'] = mlp_enconder_input_size

    # instantiate networks
    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])
    context_encoder = ProbabilisticContextEncoder(mlp_enconder,
                                                  variant['latent_dim'])
    ensemble_predictor = EnsemblePredictor(ensemble_params_list)
    Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )

    trainer = SuperQTrainer(
        ensemble_predictor=ensemble_predictor,
        num_network_ensemble=variant['num_network_ensemble'],
        bcq_policies=bcq_policies,
        std_threshold=variant['std_threshold'],
        is_combine=variant['is_combine'],
        nets=[context_encoder, Qs, vae_decoder, perturbation_generator])

    path_collector = RemotePathCollector(variant)

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        path_collector,
        train_buffer,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    # Log the variant
    logger.log("Variant:")
    logger.log(json.dumps(dict_to_safe_json(variant), indent=2))

    algorithm.train(start_epoch)
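# Illustrative sketch (assumption): the resume logic shared by the experiment
# functions above. `resume_epoch` is a hypothetical helper; the point is only
# that the conditional expression restarts training at the epoch after the
# last completed one, and starts from zero when there is no previous state.
def resume_epoch(prev_exp_state):
    return prev_exp_state['epoch'] + 1 if prev_exp_state is not None else 0


assert resume_epoch(None) == 0
assert resume_epoch({'epoch': 41}) == 42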
class BCQ(object):
    def __init__(self, state_dim, action_dim, max_action,
                 vae_latent_dim_multiplicity, target_q_coef, actor_hid_sizes,
                 critic_hid_sizes, vae_e_hid_sizes, vae_d_hid_sizes,
                 encoder_latent_dim, g_hid_sizes, g_latent_dim, h_hid_sizes,
                 E_hid_sizes, P_hid_sizes):
        vae_latent_dim = vae_latent_dim_multiplicity * action_dim

        self.actor = Actor(state_dim, action_dim, encoder_latent_dim,
                           actor_hid_sizes, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, encoder_latent_dim,
                                  actor_hid_sizes, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(state_dim, action_dim, encoder_latent_dim,
                             critic_hid_sizes).to(device)
        self.critic_target = Critic(state_dim, action_dim, encoder_latent_dim,
                                    critic_hid_sizes).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.vae = VAE(state_dim, action_dim, encoder_latent_dim,
                       vae_latent_dim, vae_e_hid_sizes, vae_d_hid_sizes,
                       max_action).to(device)

        self.mlp_encoder = MlpEncoder(state_dim, action_dim,
                                      encoder_latent_dim, g_hid_sizes,
                                      g_latent_dim, h_hid_sizes).to(device)

        self.E = FlattenMlp(
            hidden_sizes=E_hid_sizes,
            input_size=state_dim + action_dim,
            output_size=state_dim,
        )
        self.P = FlattenMlp(
            hidden_sizes=P_hid_sizes,
            input_size=state_dim + encoder_latent_dim,
            output_size=1,
        )

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=3e-4)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=3e-4)
        self.vae_optimizer = torch.optim.Adam(self.vae.parameters(), lr=3e-4)
        self.mlp_encoder_optimizer = torch.optim.Adam(
            self.mlp_encoder.parameters(), lr=3e-4)
        self.E_optimizer = torch.optim.Adam(self.E.parameters(), lr=3e-4)
        self.P_optimizer = torch.optim.Adam(self.P.parameters(), lr=3e-4)

        self.max_action = max_action
        self.action_dim = action_dim
        self.target_q_coef = target_q_coef

        self._need_to_update_eval_statistics = True
        self.eval_statistics = OrderedDict()

    def get_perturbation(self, state, action):
        perturbation = self.actor.get_perturbation(state, action)
        return perturbation

    def select_action(self, state):
        with torch.no_grad():
            state = torch.FloatTensor(state.reshape(1, -1)).repeat(
                10, 1).to(device)
            action = self.actor(state, self.vae.decode(state))
            q1 = self.critic.q1(state, action)
            ind = q1.max(0)[1]
        return action[ind].cpu().data.numpy().flatten()

    def train(self, train_data, discount=0.99, tau=0.005):
        # Sample replay buffer / batch
        state_np, next_state_np, action, reward, done, context = train_data
        state = torch.FloatTensor(state_np).to(device)
        action = torch.FloatTensor(action).to(device)
        next_state = torch.FloatTensor(next_state_np).to(device)
        reward = torch.FloatTensor(reward).to(device)
        done = torch.FloatTensor(1 - done).to(device)
        context = torch.FloatTensor(context).to(device)

        gt.stamp('unpack_data', unique=False)

        # Infer the MDP identity using the context
        # inferred_mdp = self.mlp_encoder(context)
        # in_mdp_batch_size = state.shape[0] // context.shape[0]
        # inferred_mdp = torch.repeat_interleave(inferred_mdp,
        #                                        in_mdp_batch_size, dim=0)
        # gt.stamp('infer_mdp_identity', unique=False)

        # Train the mlp encoder to predict the rewards.
        # self.mlp_encoder.zero_grad()
        # pred_next_obs = self.E(state, action)
        # pred_rewards = self.P(pred_next_obs, inferred_mdp)
        # reward_loss = F.mse_loss(pred_rewards, reward)
        # gt.stamp('get_reward_loss', unique=False)
        # reward_loss.backward(retain_graph=True)
        # gt.stamp('get_reward_gradient', unique=False)

        # Extend the state space using the inferred_mdp
        # state = torch.cat([state, inferred_mdp], dim=1)
        # next_state = torch.cat([next_state, inferred_mdp], dim=1)
        # gt.stamp('extend_original_state', unique=False)

        # Critic Training
        self.critic_optimizer.zero_grad()
        with torch.no_grad():
            # Duplicate the next state 10 times
            state_rep = next_state.repeat_interleave(10, dim=0)
            gt.stamp('check0', unique=False)

            # candidate_action = self.vae.decode(state_rep)
            # torch.cuda.synchronize()
            # gt.stamp('check1', unique=False)

            # perturbated_action = self.actor_target(state_rep,
            #                                        candidate_action)
            # torch.cuda.synchronize()
            # gt.stamp('check2', unique=False)

            # target_Q1, target_Q2 = self.critic_target(state_rep,
            #                                           perturbated_action)
            # torch.cuda.synchronize()
            # gt.stamp('check3', unique=False)

            target_Q1, target_Q2 = self.critic_target(
                state_rep,
                self.actor_target(state_rep, self.vae.decode(state_rep)))

            # Soft Clipped Double Q-learning
            target_Q = self.target_q_coef * torch.min(target_Q1, target_Q2) + (
                1 - self.target_q_coef) * torch.max(target_Q1, target_Q2)
            target_Q = target_Q.view(state.shape[0], -1).max(1)[0].view(-1, 1)
            target_Q = reward + done * discount * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        gt.stamp('get_critic_loss', unique=False)

        critic_loss.backward()  # retain_graph=True

        gt.stamp('get_critic_gradient', unique=False)

        # self.mlp_encoder_optimizer.step()
        # gt.stamp('update_mlp_encoder', unique=False)

        # Variational Auto-Encoder Training
        recon, mean, std = self.vae(state, action)
        recon_loss = F.mse_loss(recon, action)
        KL_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) -
                          std.pow(2)).mean()
        vae_loss = recon_loss + 0.5 * KL_loss

        gt.stamp('get_vae_loss', unique=False)

        self.vae_optimizer.zero_grad()
        vae_loss.backward()
        self.vae_optimizer.step()

        gt.stamp('update_vae', unique=False)

        self.critic_optimizer.step()

        gt.stamp('update_critic', unique=False)

        # Perturbation Model / Action Training
        sampled_actions = self.vae.decode(state)
        perturbed_actions = self.actor(state, sampled_actions)

        # Update through DPG
        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic.q1(state, perturbed_actions).mean()

        gt.stamp('get_actor_loss', unique=False)

        actor_loss.backward()
        self.actor_optimizer.step()

        gt.stamp('update_actor', unique=False)

        # Update Target Networks
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)

        gt.stamp('update_target_network', unique=False)

        """
        Save some statistics for eval
        """
        if self._need_to_update_eval_statistics:
            self._need_to_update_eval_statistics = False
            """
            Eval should set this to None.
            This way, these statistics are only computed for one batch.
            """
            self.eval_statistics['actor_loss'] = np.mean(
                get_numpy(actor_loss))
            self.eval_statistics['critic_loss'] = np.mean(
                get_numpy(critic_loss))
            self.eval_statistics['vae_loss'] = np.mean(get_numpy(vae_loss))
            # self.eval_statistics['reward_loss'] = np.mean(
            #     get_numpy(reward_loss)
            # )

    def get_diagnostics(self):
        return self.eval_statistics

    def end_epoch(self, epoch):
        self._need_to_update_eval_statistics = True

    @property
    def networks(self):
        return [
            self.actor, self.critic, self.vae, self.mlp_encoder, self.E,
            self.P
        ]

    @property
    def eval_networks(self):
        '''
        Return the networks used for policy evaluation.
        '''
        return [self.actor, self.critic, self.vae, self.mlp_encoder]

    def get_snapshot(self):
        return dict(
            actor_dict=self.actor.state_dict(),
            critic_dict=self.critic.state_dict(),
            vae_dict=self.vae.state_dict(),
            mlp_encoder_dict=self.mlp_encoder.state_dict(),
            E_state_dict=self.E.state_dict(),
            P_state_dict=self.P.state_dict(),
            # Save each optimizer's state separately (this class has no
            # joint optimizer).
            actor_optimizer_state_dict=self.actor_optimizer.state_dict(),
            critic_optimizer_state_dict=self.critic_optimizer.state_dict(),
            vae_optimizer_state_dict=self.vae_optimizer.state_dict(),
            eval_statistics=self.eval_statistics,
            _need_to_update_eval_statistics=self.
            _need_to_update_eval_statistics)
class PathCollectorSingleMdp(object):
    def __init__(self, variant, goal, candidate_size=10):
        ptu.set_gpu_mode(True)

        import sys
        sys.argv = ['']
        del sys

        env_max_action = variant['env_max_action']
        obs_dim = variant['obs_dim']
        action_dim = variant['action_dim']
        latent_dim = variant['latent_dim']
        vae_latent_dim = 2 * action_dim

        self.f = MlpEncoder(
            g_hidden_sizes=variant['g_hidden_sizes'],
            g_input_sizes=obs_dim + action_dim + 1,
            g_latent_dim=variant['g_latent_dim'],
            h_hidden_sizes=variant['h_hidden_sizes'],
            latent_dim=latent_dim,
        )
        self.Qs = FlattenMlp(
            hidden_sizes=variant['Qs_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )
        self.vae_decoder = VaeDecoder(
            max_action=variant['env_max_action'],
            hidden_sizes=variant['vae_hidden_sizes'],
            input_size=obs_dim + vae_latent_dim + latent_dim,
            output_size=action_dim,
        )
        self.perturbation_generator = PerturbationGenerator(
            max_action=env_max_action,
            hidden_sizes=variant['perturbation_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=action_dim,
        )

        self.env = env_producer(variant['domain'], variant['seed'], goal)
        self.num_evals = variant['algo_params']['num_evals']
        self.max_path_length = variant['algo_params']['max_path_length']
        self.vae_latent_dim = vae_latent_dim
        self.num_trans_context = variant['num_trans_context']
        self.candidate_size = variant['candidate_size']

    def async_evaluate(self, params_list, goal=None):
        if goal is not None:
            self.env.set_goal(goal)
        self.set_network_params(params_list)

        avg_reward = 0.
        avg_achieved = []
        final_achieved = []
        for _ in range(self.num_evals):
            obs = self.env.reset()
            done = False
            path_length = 0
            raw_context = deque()
            while not done and path_length < self.max_path_length:
                action = self.select_actions(np.array(obs), raw_context)
                next_obs, reward, done, env_info = self.env.step(action)
                avg_achieved.append(env_info['achieved'])
                raw_context.append(
                    np.concatenate([
                        obs.reshape(1, -1),
                        action.reshape(1, -1),
                        np.array(reward).reshape(1, -1)
                    ], axis=1))
                print(env_info['achieved'])
                obs = next_obs.copy()
                avg_reward += reward
                path_length += 1
            final_achieved.append(env_info['achieved'])

        avg_reward /= self.num_evals

        if np.isscalar(env_info['achieved']):
            avg_achieved = np.mean(avg_achieved)
            final_achieved = np.mean(final_achieved)
        else:
            # avg_achieved = np.stack(avg_achieved)
            # avg_achieved = np.mean(avg_achieved, axis=0)
            final_achieved = np.stack(final_achieved)
            final_achieved = np.mean(final_achieved, axis=0)

        return avg_reward, (final_achieved.tolist(), self.env._goal.tolist())
        # return avg_reward, (avg_achieved, self.env._goal), (final_achieved, self.env._goal)

    def get_rollout(self, goal=None, bcq_policy=None):
        if goal is not None:
            self.env.set_goal(goal)

        obs = self.env.reset()
        done = False
        path_length = 0
        avg_reward = 0.
        traj = []
        raw_context = deque()
        while not done and path_length < self.max_path_length:
            if bcq_policy is not None and path_length < 20:
                # print(obs[:2])
                action = bcq_policy.select_action(obs)
            else:
                # print(obs[:2])
                action = self.select_actions(np.array(obs), raw_context)
            action = self.select_actions(np.array(obs), raw_context)
            next_obs, reward, done, env_info = self.env.step(action)
            traj.append([obs, next_obs, action, reward, raw_context, env_info])
            raw_context.append(
                np.concatenate([
                    obs.reshape(1, -1),
                    action.reshape(1, -1),
                    np.array(reward).reshape(1, -1)
                ], axis=1))
            obs = next_obs.copy()
            path_length += 1
            avg_reward += reward

        print(avg_reward)
        return traj

    def set_network_params(self, params_list):
        '''
        The shipped params are on the CPU here. This function sets the params
        of the sampler's networks from params_list and ships them to the GPU.
        '''
        f_params, Qs_params, vae_params, perturbation_params = params_list

        self.f.set_param_values(f_params)
        self.f.to(ptu.device)

        self.Qs.set_param_values(Qs_params)
        self.Qs.to(ptu.device)

        self.vae_decoder.set_param_values(vae_params)
        self.vae_decoder.to(ptu.device)

        self.perturbation_generator.set_param_values(perturbation_params)
        self.perturbation_generator.to(ptu.device)

    def select_actions(self, obs, raw_context):
        # Repeat the obs as BCQ does; candidate_size indicates how many
        # candidate actions we need.
        obs = from_numpy(np.tile(obs.reshape(1, -1),
                                 (self.candidate_size, 1)))
        if len(raw_context) == 0:
            # In the beginning, the inferred_mdp is set to the zero vector.
            inferred_mdp = ptu.zeros((1, self.f.latent_dim))
        else:
            # Construct the context from the raw context
            context = from_numpy(np.concatenate(raw_context, axis=0))[None]
            inferred_mdp = self.f(context)
        with torch.no_grad():
            inferred_mdp = inferred_mdp.repeat(self.candidate_size, 1)
            z = from_numpy(
                np.random.normal(0, 1, size=(obs.size(0),
                                             self.vae_latent_dim))).clamp(
                                                 -0.5, 0.5).to(ptu.device)
            candidate_actions = self.vae_decoder(obs, z, inferred_mdp)
            perturbed_actions = self.perturbation_generator.get_perturbed_actions(
                obs, candidate_actions, inferred_mdp)
            qv = self.Qs(obs, perturbed_actions, inferred_mdp)
            ind = qv.max(0)[1]
        return ptu.get_numpy(perturbed_actions[ind])
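# Illustrative sketch (assumption): `select_actions` consumes `raw_context` as
# a deque of (1, obs_dim + action_dim + 1) transition rows and turns it into a
# single batched tensor of shape (1, T, obs_dim + action_dim + 1) via
# np.concatenate(...)[None]. The toy dimensions below just make the shapes
# concrete.
from collections import deque

import numpy as np

obs_dim, action_dim = 4, 2
raw_context = deque()
for _ in range(3):  # three recorded transitions
    obs = np.random.randn(obs_dim)
    action = np.random.randn(action_dim)
    reward = np.random.randn()
    raw_context.append(
        np.concatenate([
            obs.reshape(1, -1),
            action.reshape(1, -1),
            np.array(reward).reshape(1, -1)
        ], axis=1))

context = np.concatenate(raw_context, axis=0)[None]
assert context.shape == (1, 3, obs_dim + action_dim + 1)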
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )

    # Get producer function for policy
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=variant['policy_hidden_sizes'])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)

    expl_path_collector = MdpPathCollector(expl_env, )

    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)

    trainer = SACTrainer(policy_producer,
                         qf1=qf1,
                         target_qf1=target_qf1,
                         qf2=qf2,
                         target_qf2=target_qf2,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    algorithm.train(start_epoch)
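# Illustrative sketch (assumption): a minimal `variant` dict listing only the
# keys this experiment() actually reads. Every concrete value below is a
# placeholder, not a setting from the original experiments; the nested dicts
# depend on the SACTrainer / BatchRLAlgorithm signatures in this repo.
example_variant = {
    'domain': 'ant-dir',          # placeholder environment name
    'seed': 0,
    'goal': 0,
    'Qs_hidden_sizes': [256, 256],
    'policy_hidden_sizes': [256, 256],
    'replay_buffer_size': int(1e6),
    'trainer_kwargs': {},         # forwarded to SACTrainer(**...)
    'optimistic_exp': {},         # passed as optimistic_exp_hp
    'algorithm_kwargs': {},       # forwarded to BatchRLAlgorithm(**...)
}

# experiment(example_variant)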