def _init_model(self): """init model from parameters""" self.num_states = self.env.observation_space.shape[0] self.num_actions = self.env.action_space.shape[0] self.action_low, self.action_high = self.env.action_space.low[ 0], self.env.action_space.high[0] # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Actor(self.num_states, self.num_actions, self.action_high).to(device) self.policy_net_target = Actor(self.num_states, self.num_actions, self.action_high).to(device) self.value_net = Value(self.num_states + self.num_actions).to(device) self.value_net_target = Value(self.num_states + self.num_actions).to(device) self.policy_net_target.load_state_dict(self.policy_net.state_dict()) self.value_net_target.load_state_dict(self.value_net.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)
def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
    self.device = args.device
    self.config = args
    if is_atari:
        self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
        self.critic = CNNCritic(state_dim).to(self.device)
    else:
        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
        self.critic = Value(state_dim).to(self.device)

    # initialize optimizer for actor and critic
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.learning_rate)

    # optimization epoch number and batch size for PPO
    self.optim_epochs = 10
    self.optim_batch_size = 64
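# Hedged sketch (not part of the class above): the clipped-surrogate actor loss that a
# mini-batch PPO loop driven by `optim_epochs`/`optim_batch_size` typically minimizes.
# `clip_eps` and the tensor arguments are illustrative placeholders, not names from this repo.
import torch

def ppo_actor_loss(log_probs, old_log_probs, advantages, clip_eps=0.2):
    ratio = torch.exp(log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(surr1, surr2).mean()  # negated so it can be minimized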
def _init_model(self): """init model from parameters""" self.num_states = self.env.observation_space.shape[0] self.num_actions = self.env.action_space.shape[0] self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0] self.target_entropy = - np.prod(self.env.action_space.shape) # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Actor(self.num_states, self.num_actions, action_limit=self.action_high).to(device) self.q_net_1 = Value(self.num_states + self.num_actions).to(device) self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device) self.q_net_2 = Value(self.num_states + self.num_actions).to(device) self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device) # self.alpha init self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_() self.q_net_target_1.load_state_dict(self.q_net_1.state_dict()) self.q_net_target_2.load_state_dict(self.q_net_2.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a) self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q) self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)
class TestValue(TestCase):
    def setUp(self) -> None:
        self.value = Value(6, drop_rate=0.5)
        self.value2 = Value(11, drop_rate=0.5)
        print(self.value)

    def test_forward(self):
        res = self.value.forward(torch.rand((5, 6)))
        self.assertEqual(res.size(), torch.Size([5, 1]))

    def test_multi_forward(self):
        x1 = torch.rand((5, 6))
        x2 = torch.rand((5, 5))
        res = self.value2.forward(x1, x2)
        print(res)
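# Hedged sketch (an assumption, not the repo's actual implementation): the kind of
# state-value MLP the tests above imply -- extra inputs are concatenated along the last
# dimension and the output is one scalar per row.
import torch
import torch.nn as nn

class ValueSketch(nn.Module):
    def __init__(self, num_states, num_hiddens=(64, 64), drop_rate=0.0):
        super().__init__()
        layers, last = [], num_states
        for h in num_hiddens:
            layers += [nn.Linear(last, h), nn.ReLU(), nn.Dropout(drop_rate)]
            last = h
        layers.append(nn.Linear(last, 1))  # single state-value output
        self.net = nn.Sequential(*layers)

    def forward(self, *xs):
        return self.net(torch.cat(xs, dim=-1))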
def _init_model(self):
    self.V = Value(num_states=self.config["value"]["num_states"],
                   num_hiddens=self.config["value"]["num_hiddens"],
                   drop_rate=self.config["value"]["drop_rate"],
                   activation=self.config["value"]["activation"])
    self.P = JointPolicy(initial_state=self.expert_dataset.state.to(device),
                         config=self.config["jointpolicy"])
    self.D = Discriminator(num_states=self.config["discriminator"]["num_states"],
                           num_actions=self.config["discriminator"]["num_actions"],
                           num_hiddens=self.config["discriminator"]["num_hiddens"],
                           drop_rate=self.config["discriminator"]["drop_rate"],
                           use_noise=self.config["discriminator"]["use_noise"],
                           noise_std=self.config["discriminator"]["noise_std"],
                           activation=self.config["discriminator"]["activation"])

    print("Model Structure")
    print(self.P)
    print(self.V)
    print(self.D)
    print()

    self.optimizer_policy = optim.Adam(self.P.parameters(),
                                       lr=self.config["jointpolicy"]["learning_rate"])
    self.optimizer_value = optim.Adam(self.V.parameters(),
                                      lr=self.config["value"]["learning_rate"])
    self.optimizer_discriminator = optim.Adam(self.D.parameters(),
                                              lr=self.config["discriminator"]["learning_rate"])
    self.scheduler_discriminator = optim.lr_scheduler.StepLR(self.optimizer_discriminator,
                                                             step_size=2000, gamma=0.95)

    self.discriminator_func = nn.BCELoss()

    to_device(self.V, self.P, self.D, self.D, self.discriminator_func)
def __init__(self, state_dim, channels, kernel_sizes, strides, paddings=None,
             head_hidden_size=(128, 128), num_aux=0, activation='relu',
             use_maxpool=False, resnet_first_layer=False):
    super().__init__(state_dim, 1, channels, kernel_sizes, strides, paddings,
                     activation, use_maxpool, num_aux, resnet_first_layer)
    self.head = Value(self.conv_out_size_for_fc + num_aux, head_hidden_size, activation)
def create_networks(): """define actor and critic""" if is_disc_action: policy_net = DiscretePolicy(state_dim, env.action_space.n, hidden_size=(64, 32), activation='relu') else: policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=(64, 32), activation='relu') value_net = Value(state_dim, hidden_size=(32, 16), activation='relu') if args.WGAN: discrim_net = SNDiscriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu') elif args.EBGAN or args.GMMIL: discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=64, activation='relu', slope=0.1, dropout=False, dprob=0.2) elif args.GEOMGAN: # new kernel #discrim_net = KernelNet(state_dim + action_dim,state_dim + action_dim) noise_dim = 64 discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=noise_dim, activation='relu', slope=0.1, dropout=False, dprob=0.2) kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim, activation='relu', slope=0.1, dropout=False, dprob=0.2) optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate / 2) scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones, gamma=args.lr_decay) else: discrim_net = Discriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu') optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate) optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate) scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay) scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay) scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_decay) if args.WGAN: class ExpertReward(): def __init__(self): self.a = 0 def expert_reward(self, state, action): state_action = tensor(np.hstack([state, action]), dtype=dtype) with torch.no_grad(): return -discrim_net(state_action)[0].item() # return -discrim_net(state_action).sum().item() learned_reward = ExpertReward() elif args.EBGAN: class ExpertReward(): def __init__(self): self.a = 0 def expert_reward(self, state, action): state_action = tensor(np.hstack([state, action]), dtype=dtype) with torch.no_grad(): _, recon_out = discrim_net(state_action) return -elementwise_loss( recon_out, state_action).item() + args.r_margin learned_reward = ExpertReward() elif args.GMMIL or args.GEOMGAN: class ExpertReward(): def __init__(self): self.r_bias = 0 def expert_reward(self, state, action): with torch.no_grad(): return self.r_bias def update_XX_YY(self): self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t())) self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t())) learned_reward = ExpertReward() else: class ExpertReward(): def __init__(self): self.a = 0 def expert_reward(self, state, action): state_action = tensor(np.hstack([state, action]), dtype=dtype) with torch.no_grad(): return -math.log(discrim_net(state_action)[0].item()) learned_reward = ExpertReward() """create agent""" agent = Agent(env, policy_net, device, custom_reward=learned_reward, running_state=None, render=args.render, num_threads=args.num_threads) def update_params(batch, i_iter): dataSize = min(args.min_batch_size, len(batch.state)) states = torch.from_numpy(np.stack( batch.state)[:dataSize, ]).to(dtype).to(device) actions = torch.from_numpy(np.stack( 
batch.action)[:dataSize, ]).to(dtype).to(device) rewards = torch.from_numpy(np.stack( batch.reward)[:dataSize, ]).to(dtype).to(device) masks = torch.from_numpy(np.stack( batch.mask)[:dataSize, ]).to(dtype).to(device) with torch.no_grad(): values = value_net(states) fixed_log_probs = policy_net.get_log_prob(states, actions) """estimate reward""" """get advantage estimation from the trajectories""" advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device) """update discriminator""" for _ in range(args.discriminator_epochs): #dataSize = states.size()[0] # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device) exp_idx = random.sample(range(expert_traj.shape[0]), dataSize) expert_state_actions = torch.from_numpy( expert_traj[exp_idx, :]).to(dtype).to(device) dis_input_real = expert_state_actions if len(actions.shape) == 1: actions.unsqueeze_(-1) dis_input_fake = torch.cat([states, actions], 1) actions.squeeze_(-1) else: dis_input_fake = torch.cat([states, actions], 1) if args.EBGAN or args.GMMIL or args.GEOMGAN: # tbd, no discriminaotr learning pass else: g_o = discrim_net(dis_input_fake) e_o = discrim_net(dis_input_real) optimizer_discrim.zero_grad() if args.GEOMGAN: optimizer_kernel.zero_grad() if args.WGAN: if args.LSGAN: pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb) discrim_loss = LeakyReLU(e_o - g_o + pdist).mean() else: discrim_loss = torch.mean(e_o) - torch.mean(g_o) elif args.EBGAN: e_recon = elementwise_loss(e_o, dis_input_real) g_recon = elementwise_loss(g_o, dis_input_fake) discrim_loss = e_recon if (args.margin - g_recon).item() > 0: discrim_loss += (args.margin - g_recon) elif args.GMMIL: #mmd2_D,K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list) mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake, args.sigma_list) #tbd #rewards = K[0]+K[1]-2*K[2] rewards = K[1] - K[2] # -(exp - gen): -(kxy-kyy)=kyy-kxy rewards = -rewards.detach( ) # exp - gen, maximize (gen label negative) errD = mmd2_D discrim_loss = -errD # maximize errD # prep for generator advantages, returns = estimate_advantages( rewards, masks, values, args.gamma, args.tau, device) elif args.GEOMGAN: # larger, better, but slower noise_num = 100 mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda) rewards = K[1] - K[2] # -(exp - gen): -(kxy-kyy)=kyy-kxy rewards = -rewards.detach() errD = mmd2_D #+ args.lambda_rg * one_side_errD discrim_loss = -errD # maximize errD # prep for generator advantages, returns = estimate_advantages( rewards, masks, values, args.gamma, args.tau, device) else: discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \ discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device)) if args.GEOMGAN: optimizer_kernel.step() """perform mini-batch PPO update""" optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size)) for _ in range(args.generator_epochs): perm = np.arange(states.shape[0]) np.random.shuffle(perm) perm = LongTensor(perm).to(device) states, actions, returns, advantages, fixed_log_probs = \ states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \ fixed_log_probs[perm].clone() for i in range(optim_iter_num): ind = slice( i * args.ppo_batch_size, min((i + 1) * args.ppo_batch_size, states.shape[0])) states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \ states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind] ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 
1, states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg) return rewards if args.GEOMGAN: return policy_net,value_net,discrim_net,kernel_net,optimizer_policy,optimizer_value,optimizer_discrim,optimizer_kernel,agent,update_params \ ,scheduler_policy,scheduler_value,scheduler_discrim,scheduler_kernel else: return policy_net,value_net,discrim_net,optimizer_policy,optimizer_value,optimizer_discrim,agent,update_params \ ,scheduler_policy,scheduler_value,scheduler_discrim
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
policy_net = []
value_net = []
if args.model_path is None:
    if is_disc_action:
        for i in range(env_dummy.n):
            policy_net.append(DiscretePolicy(obs_shape_n[i], act_shape_n[i]))
            # print(policy_net[i])
    else:
        # build one continuous-action policy per agent
        for i in range(env_dummy.n):
            policy_net.append(Policy(obs_shape_n[i], env_dummy.action_space.shape[0],
                                     log_std=args.log_std))
    # value_net = Value(state_dim)
    for i in range(env_dummy.n):
        value_net.append(Value(obs_shape_n[i] * env_dummy.n))
        # print(value_net[i])
else:
    # TODO
    policy_net, value_net = pickle.load(open(args.model_path, "rb"))
    # policy_net = [env_dummy.observation_space[i].shape[0] for i in range(env_dummy.n)]

if use_gpu:
    # policy_net = policy_net.cuda()
    # value_net = value_net.cuda()
    for i in range(env_dummy.n):
        policy_net[i].cuda()
        value_net[i].cuda()

optimizer_policy = []
optimizer_value = []
for i in range(env_dummy.n):
def learn_model(args): print("RL result will be saved at %s" % args.rl_filename) print("RL model will be saved at %s" % args.rl_model_filename) if use_gpu: print("Using CUDA.") torch.manual_seed(args.rl_seed) if use_gpu: torch.cuda.manual_seed_all(args.rl_seed) torch.backends.cudnn.deterministic = True np.random.seed(args.rl_seed) random.seed(args.rl_seed) env = gym.make(args.env_name) env.seed(args.rl_seed) env_test = gym.make(args.env_name) env_test.seed(args.rl_seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] a_bound = np.asscalar(env.action_space.high[0]) a_low = np.asscalar(env.action_space.low[0]) assert a_bound == -a_low ## Binary flag for manually cliping actions for step function after adding Gaussian noise. clip = (args.env_name == "LunarLanderContinuous-v2" or args.env_name == "BipedalWalker-v2") print(env.observation_space) print(env.action_space) """define actor and critic""" policy_net = Policy(state_dim, action_dim, log_std=args.log_std, a_bound=a_bound, hidden_size=args.hidden_size, activation=args.activation).to(device) value_net = Value(state_dim, hidden_size=args.hidden_size, activation=args.activation).to(device) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate_v) decayed_lambda_td = args.lambda_td def update_params_c(batch, i_iter): states = torch.from_numpy(np.stack(batch.state)).float().to(device) actions = torch.from_numpy(np.stack(batch.action)).float().to(device) rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device) masks = torch.from_numpy(np.stack(batch.mask).astype( np.float32)).to(device) """get advantage estimation from the trajectories""" values = value_net(states).data advantages, lambda_returns, mc_returns = estimate_advantages( rewards, masks, values, args.gamma, args.tau) if args.lamret: returns = lambda_returns else: returns = mc_returns """perform critic update""" #gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg) # full batch GD gae_step_epoch(value_net, optimizer_value, states, returns, args.l2_reg) # Stochastic GD """ Function to update the parameters of value and policy networks""" def update_params_p(batch, i_iter): nonlocal decayed_lambda_td states = torch.from_numpy(np.stack(batch.state)).float().to(device) actions = torch.from_numpy(np.stack(batch.action)).float().to(device) next_states = torch.from_numpy(np.stack( batch.next_state)).float().to(device) rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device) masks = torch.from_numpy(np.stack(batch.mask).astype( np.float32)).to(device) """get advantage estimation from the trajectories, this is done after gae_step update""" values = value_net(states).data advantages, lambda_returns, mc_returns = estimate_advantages( rewards, masks, values, gamma=args.gamma, tau=args.tau) if args.method_name == "TRPO-RET-MC": returns = mc_returns.detach( ) # detach() does not matter since we back prop policy network only. elif args.method_name == "TRPO-RET-GAE": returns = lambda_returns.detach( ) # detach() does not matter actually. else: returns = 0 # returns is not used for TRPO and TRPO-TD. # standardize or not ? 
if args.mgae: advantages = (advantages - advantages.mean() ) / advantages.std() # this will be m-std version else: advantages = advantages / advantages.std( ) # this will be std version trpo_step_td(policy_net=policy_net, value_net=value_net, states=states, actions=actions, next_states=next_states, rewards=rewards, masks=masks, gamma=args.gamma, advantages=advantages, \ max_kl=args.max_kl, damping=args.damping, \ lambda_td=decayed_lambda_td, method_name=args.method_name, returns=returns, mtd=args.mtd) """ decay the td_reg parameter after update """ decayed_lambda_td = decayed_lambda_td * args.decay_td """create agent""" agent = Agent(env, policy_net, render=False) agent_test = Agent(env_test, policy_net, mean_action=True, render=args.render) """ The actual learning loop""" for i_iter in range(args.rl_max_iter_num): """ Save the learned policy model """ if ( (i_iter) % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0 ) \ or (i_iter == args.rl_max_iter_num + 1) or i_iter == 0: policy_net = policy_net.to(device_cpu) value_net = value_net.to(device_cpu) pickle.dump((policy_net, value_net), open(args.rl_model_filename + ("_I%d.p" % (i_iter)), 'wb')) policy_net = policy_net.to(device) value_net = value_net.to(device) """ Test the policy before update """ if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num: _, log_test = agent_test.collect_samples_test(max_num_episodes=20, render=args.render, clip=clip) """generate multiple trajectories that reach the minimum batch_size""" t0 = time.time() batch, log = agent.collect_samples_train( args.min_batch_size, render=False, clip=clip) # this is on-policy samples t1 = time.time() """ update parameters """ t0_d = time.time() update_params_c(batch, i_iter) #critic update update_params_p(batch, i_iter) #actor update t1_d = time.time() """ Print out result to stdout and save it to a text file for later usage""" if i_iter % args.log_interval == 0: result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" % (i_iter, t1 - t0, t1_d - t0_d)) result_text += " | [R] " + t_format( "Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2) result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \ + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2) print(result_text) with open(args.rl_filename, 'a') as f: print(result_text, file=f)
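# Hedged sketch (an assumption about the helper, not its actual code): the GAE(lambda)
# recursion behind `estimate_advantages` as used above; the real helper also returns
# separate lambda- and Monte-Carlo returns and handles device placement.
import torch

def estimate_advantages_sketch(rewards, masks, values, gamma, tau):
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * masks[t] * prev_advantage
        prev_value, prev_advantage = values[t], advantages[t]
    returns = values + advantages  # lambda-returns used as critic targets
    return advantages, returns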
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state,
              render=args.render, num_threads=args.num_threads)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))

policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env,
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20, 20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3, 3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 64

"""create agent"""
print(H)

"""define actor and critic"""
size = (128, 128)
policy_size = (64, 64)
critic_size = size  # (8, 8)
advantage_size = size  # (8, 8)
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
                            hidden_size=policy_size, scale_cov=args.scale_cov)
        # policy_net = Policy(state_dim, env_dummy.action_space.shape[0], hidden_size=policy_size, log_std=0)
    value_net = Value(state_dim, hidden_size=critic_size)
    advantage_net = Advantage((state_dim, action_dim), hidden_size=advantage_size)
else:
    policy_net, value_net, advantage_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    advantage_net = advantage_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
                    metavar='N',
                    help="pretrain discriminator iteration (default: 30)")
args = parser.parse_args()

use_gpu = True
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

is_disc_action = False
action_dim = 10
ActionTensor = DoubleTensor

"""define actor, critic and discriminator"""
policy_net = Policy(10, 256, 10, num_layers=2)
value_net = Value(10, 256, num_layers=3)
discrim_net = Discriminator(10, 256, 10, num_layers=3)
discrim_criterion = nn.BCELoss()

#####################################################
### Load Models
load_models = True
if load_models:
    print("Loading Models")
    policy_net, value_net, discrim_net = pickle.load(
        open('learned_models/nextaction_pretrain_sigpolicy.p', 'rb'))
    # _, _, discrim_net = pickle.load(open('learned_models/nextaction_trained_sigpolicy.p', 'rb'))
    print("Loading Models Finished")
#####################################################

if use_gpu:
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        if args.sac_policy:
            policy_net = Policy_Tanh_Gaussian(state_dim, env.action_space.shape[0],
                                              hidden_size=(64, 64), log_std=args.log_std)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

params = list(policy_net.parameters()) + list(value_net.parameters())
unique_optimizer = torch.optim.Adam(params, lr=args.learning_rate)
def train_v_upper_envelope(states, actions, returns, state_dim, device, seed, upper_learning_rate=3e-3, weight_decay=0.02, max_step_num=int(1e6), consecutive_steps=4, k=10000): states = torch.from_numpy(np.array(states)) actions = torch.from_numpy(np.array(actions)) returns = torch.from_numpy(np.array(returns)) # returns is actually Gts use_gpu = True if device == "cuda:0" else False # Init upper_envelope net (*use relu as activation function upper_envelope = Value(state_dim, activation='relu') upper_envelope_retrain = Value(state_dim, activation='relu') optimizer_upper = torch.optim.Adam(upper_envelope.parameters(), lr=upper_learning_rate, weight_decay=weight_decay) optimizer_upper_retrain = torch.optim.Adam( upper_envelope_retrain.parameters(), lr=upper_learning_rate, weight_decay=weight_decay) if use_gpu: upper_envelope = upper_envelope.cuda() upper_envelope_retrain = upper_envelope_retrain.cuda() # =========================== # # Split data into training and testing # # But make sure the highest Ri is in the training set # pick out the highest data point highestR, indice = torch.max(returns, 0) highestR = highestR.view(-1, 1) highestS = states[indice] highestA = actions[indice] print("HighestR:", highestR) statesW = torch.cat((states[:indice], states[indice + 1:])) actionsW = torch.cat((actions[:indice], actions[indice + 1:])) returnsW = torch.cat((returns[:indice], returns[indice + 1:])) # shuffle the data perm = np.arange(statesW.shape[0]) np.random.shuffle(perm) perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm) statesW, actionsW, returnsW = statesW[perm], actionsW[perm], returnsW[perm] # divide data into train/test divide = int(states.shape[0] * 0.8) train_states, train_actions, train_returns = statesW[: divide], actionsW[: divide], returnsW[: divide] test_states, test_actions, test_returns = statesW[divide:], actionsW[ divide:], returnsW[divide:] # add the highest data into training print(train_states.size(), highestS.size()) print(train_actions.size(), highestA.size()) print(train_returns.size(), highestR.size()) train_states = torch.cat((train_states.squeeze(), highestS.unsqueeze(0))) train_actions = torch.cat((train_actions.squeeze(), highestA.unsqueeze(0))) train_returns = torch.cat( (train_returns.squeeze(), highestR.squeeze().unsqueeze(0))) # train upper envelope # env_dummy = env_factory(0) # state_dim = env_dummy.observation_space.shape[0] # upper_envelope = Value(state_dim) # optimizer = torch.optim.Adam(upper_envelope.parameters(), lr=0.003, weight_decay=20) epoch_n = 100 batch_size = 64 optim_iter_num = int(math.ceil(train_states.shape[0] / batch_size)) num_increase = 0 previous_loss = math.inf calculate_vali = 2 best_parameters = upper_envelope.state_dict() running_traning_steps = 0 best_training_steps = running_traning_steps # Upper Envelope Training starts upper_envelope.train() while num_increase < consecutive_steps: # update theta for n steps, n = calculate_vali # train calculate_vali steps for i in range(calculate_vali): train_loss = 0 perm = np.arange(train_states.shape[0]) np.random.shuffle(perm) perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm) train_states, train_actions, train_returns = train_states[ perm], train_actions[perm], train_returns[perm] for i in range(optim_iter_num): ind = slice(i * batch_size, min((i + 1) * batch_size, states.shape[0])) states_b, returns_b = train_states[ind], train_returns[ind] states_b = Variable(states_b.float()) returns_b = Variable(returns_b.float()) Vsi = upper_envelope(states_b) # loss = 
loss_fn(Vsi, returns_b) loss = L2PenaltyLoss(Vsi, returns_b, k_val=k) train_loss += loss.detach() upper_envelope.zero_grad() loss.backward() optimizer_upper.step() # early stopping running_traning_steps += calculate_vali # calculate validation error test_iter = int(math.ceil(test_states.shape[0] / batch_size)) validation_loss = 0 for n in range(test_iter): ind = slice(n * batch_size, min((n + 1) * batch_size, states.shape[0])) states_t, returns_t = test_states[ind], test_returns[ind] states_t = Variable(states_t.float()) returns_t = Variable(returns_t.float()) Vsi = upper_envelope(states_t) loss = L2PenaltyLoss(Vsi, returns_t, k_val=k) validation_loss += loss if validation_loss < previous_loss: best_training_steps = running_traning_steps previous_loss = validation_loss best_parameters = upper_envelope.state_dict() num_increase = 0 else: num_increase += 1 print("best_training_steps:", best_training_steps) upper_envelope.load_state_dict(best_parameters) # retrain on the whole set upper_envelope_retrain.train() optim_iter_num = int(math.ceil(states.shape[0] / batch_size)) for i in range(best_training_steps): train_loss = 0 perm = np.arange(states.shape[0]) np.random.shuffle(perm) perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm) states, actions, returns = states[perm], actions[perm], returns[perm] for i in range(optim_iter_num): ind = slice(i * batch_size, min((i + 1) * batch_size, states.shape[0])) states_b, returns_b = states[ind], returns[ind] states_b = Variable(states_b.float()) returns_b = Variable(returns_b.float()) Vsi = upper_envelope_retrain(states_b) #loss = loss_fn(Vsi, returns_b) loss = L2PenaltyLoss(Vsi, returns_b, k_val=k) train_loss += loss.detach() upper_envelope_retrain.zero_grad() loss.backward() optimizer_upper_retrain.step() upper_envelope.load_state_dict(upper_envelope_retrain.state_dict()) print("Policy training is complete.") return upper_envelope
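# Hedged sketch: `L2PenaltyLoss` is not shown in the snippet above. A plausible form of
# an upper-envelope regression loss is an asymmetric squared error in which predictions
# that fall below the observed return G_t are penalized k_val times harder, so the fitted
# value stays above the data.
import torch

def l2_penalty_loss_sketch(v_pred, returns, k_val=10000):
    diff = v_pred.view(-1) - returns.view(-1)
    return torch.where(diff >= 0, diff ** 2, k_val * diff ** 2).mean()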
class MAGAIL: def __init__(self, config, log_dir, exp_name): self.config = config self.exp_name = exp_name self.writer = SummaryWriter(log_dir=f"{log_dir}/{self.exp_name}") """seeding""" seed = self.config["general"]["seed"] torch.manual_seed(seed) np.random.seed(seed) self._load_expert_data() self._init_model() def _init_model(self): self.V = Value(num_states=self.config["value"]["num_states"], num_hiddens=self.config["value"]["num_hiddens"], drop_rate=self.config["value"]["drop_rate"], activation=self.config["value"]["activation"]) self.P = JointPolicy( initial_state=self.expert_dataset.state.to(device), config=self.config["jointpolicy"]) self.D = Discriminator( num_states=self.config["discriminator"]["num_states"], num_actions=self.config["discriminator"]["num_actions"], num_hiddens=self.config["discriminator"]["num_hiddens"], drop_rate=self.config["discriminator"]["drop_rate"], use_noise=self.config["discriminator"]["use_noise"], noise_std=self.config["discriminator"]["noise_std"], activation=self.config["discriminator"]["activation"]) print("Model Structure") print(self.P) print(self.V) print(self.D) print() self.optimizer_policy = optim.Adam( self.P.parameters(), lr=self.config["jointpolicy"]["learning_rate"]) self.optimizer_value = optim.Adam( self.V.parameters(), lr=self.config["value"]["learning_rate"]) self.optimizer_discriminator = optim.Adam( self.D.parameters(), lr=self.config["discriminator"]["learning_rate"]) self.scheduler_discriminator = optim.lr_scheduler.StepLR( self.optimizer_discriminator, step_size=2000, gamma=0.95) self.discriminator_func = nn.BCELoss() to_device(self.V, self.P, self.D, self.D, self.discriminator_func) def _load_expert_data(self): num_expert_states = self.config["general"]["num_states"] num_expert_actions = self.config["general"]["num_actions"] expert_batch_size = self.config["general"]["expert_batch_size"] self.expert_dataset = ExpertDataSet( data_set_path=self.config["general"]["expert_data_path"], num_states=num_expert_states, num_actions=num_expert_actions) self.expert_data_loader = DataLoader( dataset=self.expert_dataset, batch_size=expert_batch_size, shuffle=True, num_workers=multiprocessing.cpu_count() // 2) def train(self, epoch): self.P.train() self.D.train() self.V.train() # collect generated batch gen_batch = self.P.collect_samples( self.config["ppo"]["sample_batch_size"]) # batch: ('state', 'action', 'next_state', 'log_prob', 'mask') gen_batch_state = trans_shape_func( torch.stack(gen_batch.state )) # [trajectory length * parallel size, state size] gen_batch_action = trans_shape_func( torch.stack(gen_batch.action )) # [trajectory length * parallel size, action size] gen_batch_next_state = trans_shape_func( torch.stack(gen_batch.next_state) ) # [trajectory length * parallel size, state size] gen_batch_old_log_prob = trans_shape_func( torch.stack( gen_batch.log_prob)) # [trajectory length * parallel size, 1] gen_batch_mask = trans_shape_func(torch.stack( gen_batch.mask)) # [trajectory length * parallel size, 1] # grad_collect_func = lambda d: torch.cat([grad.view(-1) for grad in torch.autograd.grad(d, self.D.parameters(), retain_graph=True)]).unsqueeze(0) #################################################### # update discriminator #################################################### for expert_batch_state, expert_batch_action in self.expert_data_loader: gen_r = self.D(gen_batch_state, gen_batch_action) expert_r = self.D(expert_batch_state.to(device), expert_batch_action.to(device)) # label smoothing for discriminator expert_labels = 
torch.ones_like(expert_r) gen_labels = torch.zeros_like(gen_r) if self.config["discriminator"]["use_label_smoothing"]: smoothing_rate = self.config["discriminator"][ "label_smooth_rate"] expert_labels *= (1 - smoothing_rate) gen_labels += torch.ones_like(gen_r) * smoothing_rate e_loss = self.discriminator_func(expert_r, expert_labels) g_loss = self.discriminator_func(gen_r, gen_labels) d_loss = e_loss + g_loss # """ WGAN with Gradient Penalty""" # d_loss = gen_r.mean() - expert_r.mean() # differences_batch_state = gen_batch_state[:expert_batch_state.size(0)] - expert_batch_state # differences_batch_action = gen_batch_action[:expert_batch_action.size(0)] - expert_batch_action # alpha = torch.rand(expert_batch_state.size(0), 1) # interpolates_batch_state = gen_batch_state[:expert_batch_state.size(0)] + (alpha * differences_batch_state) # interpolates_batch_action = gen_batch_action[:expert_batch_action.size(0)] + (alpha * differences_batch_action) # gradients = torch.cat([x for x in map(grad_collect_func, self.D(interpolates_batch_state, interpolates_batch_action))]) # slopes = torch.norm(gradients, p=2, dim=-1) # gradient_penalty = torch.mean((slopes - 1.) ** 2) # d_loss += 10 * gradient_penalty self.optimizer_discriminator.zero_grad() d_loss.backward() self.optimizer_discriminator.step() self.scheduler_discriminator.step() self.writer.add_scalar('train/loss/d_loss', d_loss.item(), epoch) self.writer.add_scalar("train/loss/e_loss", e_loss.item(), epoch) self.writer.add_scalar("train/loss/g_loss", g_loss.item(), epoch) self.writer.add_scalar('train/reward/expert_r', expert_r.mean().item(), epoch) self.writer.add_scalar('train/reward/gen_r', gen_r.mean().item(), epoch) with torch.no_grad(): gen_batch_value = self.V(gen_batch_state) gen_batch_reward = self.D(gen_batch_state, gen_batch_action) gen_batch_advantage, gen_batch_return = estimate_advantages( gen_batch_reward, gen_batch_mask, gen_batch_value, self.config["gae"]["gamma"], self.config["gae"]["tau"], self.config["jointpolicy"]["trajectory_length"]) #################################################### # update policy by ppo [mini_batch] #################################################### ppo_optim_epochs = self.config["ppo"]["ppo_optim_epochs"] ppo_mini_batch_size = self.config["ppo"]["ppo_mini_batch_size"] gen_batch_size = gen_batch_state.shape[0] optim_iter_num = int(math.ceil(gen_batch_size / ppo_mini_batch_size)) for _ in range(ppo_optim_epochs): perm = torch.randperm(gen_batch_size) for i in range(optim_iter_num): ind = perm[slice( i * ppo_mini_batch_size, min((i + 1) * ppo_mini_batch_size, gen_batch_size))] mini_batch_state, mini_batch_action, mini_batch_next_state, mini_batch_advantage, mini_batch_return, \ mini_batch_old_log_prob = gen_batch_state[ind], gen_batch_action[ind], gen_batch_next_state[ind], \ gen_batch_advantage[ind], gen_batch_return[ind], gen_batch_old_log_prob[ind] v_loss, p_loss = ppo_step( self.P, self.V, self.optimizer_policy, self.optimizer_value, states=mini_batch_state, actions=mini_batch_action, next_states=mini_batch_next_state, returns=mini_batch_return, old_log_probs=mini_batch_old_log_prob, advantages=mini_batch_advantage, ppo_clip_ratio=self.config["ppo"]["clip_ratio"], value_l2_reg=self.config["value"]["l2_reg"]) self.writer.add_scalar('train/loss/p_loss', p_loss, epoch) self.writer.add_scalar('train/loss/v_loss', v_loss, epoch) print(f" Training episode:{epoch} ".center(80, "#")) print('gen_r:', gen_r.mean().item()) print('expert_r:', expert_r.mean().item()) print('d_loss', d_loss.item()) def 
eval(self, epoch): self.P.eval() self.D.eval() self.V.eval() gen_batch = self.P.collect_samples( self.config["ppo"]["sample_batch_size"]) gen_batch_state = torch.stack(gen_batch.state) gen_batch_action = torch.stack(gen_batch.action) gen_r = self.D(gen_batch_state, gen_batch_action) for expert_batch_state, expert_batch_action in self.expert_data_loader: expert_r = self.D(expert_batch_state.to(device), expert_batch_action.to(device)) print(f" Evaluating episode:{epoch} ".center(80, "-")) print('validate_gen_r:', gen_r.mean().item()) print('validate_expert_r:', expert_r.mean().item()) self.writer.add_scalar("validate/reward/gen_r", gen_r.mean().item(), epoch) self.writer.add_scalar("validate/reward/expert_r", expert_r.mean().item(), epoch) def save_model(self, save_path): if not os.path.exists(save_path): os.mkdir(save_path) # dump model from pkl file # torch.save((self.D, self.P, self.V), f"{save_path}/{self.exp_name}.pt") torch.save(self.D, f"{save_path}/{self.exp_name}_Discriminator.pt") torch.save(self.P, f"{save_path}/{self.exp_name}_JointPolicy.pt") torch.save(self.V, f"{save_path}/{self.exp_name}_Value.pt") def load_model(self, model_path): # load entire model # self.D, self.P, self.V = torch.load((self.D, self.P, self.V), f"{save_path}/{self.exp_name}.pt") self.D = torch.load(f"{model_path}_Discriminator.pt", map_location=device) self.P = torch.load(f"{model_path}_JointPolicy.pt", map_location=device) self.V = torch.load(f"{model_path}_Value.pt", map_location=device)
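# Hedged usage sketch for the MAGAIL class above; the config contents, paths and epoch
# count are placeholders rather than values from this repo.
magail = MAGAIL(config=config, log_dir="runs", exp_name="magail_demo")
for epoch in range(1, 1001):
    magail.train(epoch)
    if epoch % 50 == 0:
        magail.eval(epoch)
        magail.save_model("checkpoints")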
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu_index)

env = gym.make(args.env_name)
state_dim = env.observation_space.shape[0]
is_disc_action = len(env.action_space.shape) == 0
action_dim = 1 if is_disc_action else env.action_space.shape[0]
running_reward = ZFilter((1, ), demean=False, clip=10)

print("Seed: {}".format(args.seed))
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
value_net = Value(state_dim)  # Initialise value network. Used for calculating Advantages for TRPO
critic_net = OTCritic(state_dim + action_dim)  # Initialise OT critic

if args.resume_training:
    policy_net, value_net, critic_net, running_state, running_reward = pickle.load(
        open('assets/learned_models/ablation/SIL/{}/{}_SIL_s{}.p'.format(
            args.dataset_size, args.env_name, args.seed), "rb"))

to_device(device, policy_net, value_net, critic_net)

optimizer_ot = torch.optim.Adam(critic_net.parameters(), lr=args.critic_lr)
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# OT params
def create_networks(): """define actor and critic""" if is_disc_action: policy_net = DiscretePolicy(state_dim, env.action_space.n, hidden_size=(64, 32), activation='relu') else: policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=(64, 32), activation='relu') value_net = Value(state_dim, hidden_size=(32, 16), activation='relu') if args.AL: discrim_net = SNDiscriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu') elif args.EBGAN or args.GMMIL: discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=64, activation='leakyrelu', slope=0.1, dropout=True, dprob=0.2) elif args.VAKLIL: noise_dim = 64 mid_dim = 32 discrim_net = VAEDiscriminator(state_dim + action_dim, num_outputs=noise_dim, sigmoid_out=False, sn=True, test=False, w_init=False, hidden_size_enc=(), hidden_size_dec=(), encode_size=mid_dim, activation='relu', dropout=False) kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim, activation='relu', dropout=False) optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate) scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones, gamma=args.lr_kernel_decay) else: discrim_net = Discriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu') optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate) optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate) scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay) scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay) scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_kernel_decay) if args.AL: class ExpertReward(): def __init__(self): self.a = 0 def expert_reward(self, state, action): state_action = tensor(np.hstack([state, action]), dtype=dtype) with torch.no_grad(): return -discrim_net(state_action)[0].item() learned_reward = ExpertReward() elif args.EBGAN: class ExpertReward(): def __init__(self): self.a = 0 def expert_reward(self, state, action): state_action = tensor(np.hstack([state, action]), dtype=dtype) with torch.no_grad(): _, recon_out = discrim_net(state_action) return -elementwise_loss( recon_out, state_action).item() + args.r_margin learned_reward = ExpertReward() elif args.GMMIL or args.VAKLIL: class ExpertReward(): def __init__(self): self.r_bias = 0 def expert_reward(self, state, action): with torch.no_grad(): return self.r_bias def update_XX_YY(self): self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t())) self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t())) learned_reward = ExpertReward() else: class ExpertReward(): def __init__(self): self.a = 0 def expert_reward(self, state, action): state_action = tensor(np.hstack([state, action]), dtype=dtype) with torch.no_grad(): return -math.log(discrim_net(state_action)[0].item()) learned_reward = ExpertReward() """create agent""" agent = Agent(env, policy_net, device, custom_reward=learned_reward, running_state=None, render=args.render, num_threads=args.num_threads) def update_params(batch, i_iter): dataSize = min(args.min_batch_size, len(batch.state)) states = torch.from_numpy(np.stack( batch.state)[:dataSize, ]).to(dtype).to(device) actions = torch.from_numpy(np.stack( batch.action)[:dataSize, ]).to(dtype).to(device) rewards = 
torch.from_numpy(np.stack( batch.reward)[:dataSize, ]).to(dtype).to(device) masks = torch.from_numpy(np.stack( batch.mask)[:dataSize, ]).to(dtype).to(device) with torch.no_grad(): values = value_net(states) fixed_log_probs = policy_net.get_log_prob(states, actions) """estimate reward""" """get advantage estimation from the trajectories""" advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device) """update discriminator""" for _ in range(args.discriminator_epochs): exp_idx = random.sample(range(expert_traj.shape[0]), dataSize) expert_state_actions = torch.from_numpy( expert_traj[exp_idx, :]).to(dtype).to(device) dis_input_real = expert_state_actions if len(actions.shape) == 1: actions.unsqueeze_(-1) dis_input_fake = torch.cat([states, actions], 1) actions.squeeze_(-1) else: dis_input_fake = torch.cat([states, actions], 1) if args.EBGAN or args.GMMIL or args.VAKLIL: g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake, mean_mode=False) e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real, mean_mode=False) else: g_o = discrim_net(dis_input_fake) e_o = discrim_net(dis_input_real) optimizer_discrim.zero_grad() if args.VAKLIL: optimizer_kernel.zero_grad() if args.AL: if args.LSGAN: pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb) discrim_loss = LeakyReLU(e_o - g_o + pdist).mean() else: discrim_loss = torch.mean(e_o) - torch.mean(g_o) elif args.EBGAN: e_recon = elementwise_loss(e_o, dis_input_real) g_recon = elementwise_loss(g_o, dis_input_fake) discrim_loss = e_recon if (args.margin - g_recon).item() > 0: discrim_loss += (args.margin - g_recon) elif args.GMMIL: mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list) rewards = K[1] - K[2] # -(exp - gen): -(kxy-kyy)=kyy-kxy rewards = -rewards.detach( ) # exp - gen, maximize (gen label negative) errD = mmd2_D discrim_loss = -errD # maximize errD advantages, returns = estimate_advantages( rewards, masks, values, args.gamma, args.tau, device) elif args.VAKLIL: noise_num = 20000 mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2( e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda, args.sigma_list) mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list) errD = (mmd2_D_net + mmd2_D_rbf) / 2 # 1e-8: small number for numerical stability i_c = 0.2 bottleneck_loss = torch.mean((0.5 * torch.sum((torch.cat( (e_mu, g_mu), dim=0)**2) + (torch.cat( (e_sigma, g_sigma), dim=0)**2) - torch.log((torch.cat( (e_sigma, g_sigma), dim=0)**2) + 1e-8) - 1, dim=1))) - i_c discrim_loss = -errD + (args.beta * bottleneck_loss) + ( args.lambda_h * penalty) else: discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \ discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device)) discrim_loss.backward() optimizer_discrim.step() if args.VAKLIL: optimizer_kernel.step() if args.VAKLIL: with torch.no_grad(): noise_num = 20000 g_o_enc, _, _ = discrim_net(dis_input_fake) e_o_enc, _, _ = discrim_net(dis_input_real) _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda, args.sigma_list) _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list) K = [sum(x) / 2 for x in zip(K_net, K_rbf)] rewards = K[1] - K[2] # -(exp - gen): -(kxy-kyy)=kyy-kxy rewards = -rewards #.detach() advantages, returns = estimate_advantages( rewards, masks, values, args.gamma, args.tau, device) """perform mini-batch PPO update""" optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size)) for _ in range(args.generator_epochs): perm = np.arange(states.shape[0]) 
np.random.shuffle(perm) perm = LongTensor(perm).to(device) states, actions, returns, advantages, fixed_log_probs = \ states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \ fixed_log_probs[perm].clone() for i in range(optim_iter_num): ind = slice( i * args.ppo_batch_size, min((i + 1) * args.ppo_batch_size, states.shape[0])) states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \ states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind] ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg) return rewards if args.VAKLIL: return policy_net,value_net,discrim_net,kernel_net,optimizer_policy,optimizer_value,optimizer_discrim,optimizer_kernel,agent,update_params \ ,scheduler_policy,scheduler_value,scheduler_discrim,scheduler_kernel else: return policy_net,value_net,discrim_net,optimizer_policy,optimizer_value,optimizer_discrim,agent,update_params \ ,scheduler_policy,scheduler_value,scheduler_discrim
def update_params(batch, i_iter, opt): """update discriminator""" reirl_weights.write( reirl(expert_traj[:, :-action_dim], np.stack(batch.state), opt)) value_net = Value(state_dim) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate) if i_iter > 0: j_max = 3 #if i_iter < 20 else 15 for j in range(j_max): #3): batch, log = ppo_agent.collect_samples(3000) print('{}\tT_sample {}\texpert_R_avg {}\tR_avg {}'.format( j, log['sample_time'], log['avg_c_reward'], log['avg_reward'])) states = torch.from_numpy(np.stack( batch.state)).to(dtype).to(device) player_actions = torch.from_numpy(np.stack( batch.player_action)).to(dtype).to(device) opponent_actions = torch.from_numpy(np.stack( batch.opponent_action)).to(dtype).to(device) rewards = torch.from_numpy(np.stack( batch.reward)).to(dtype).to(device) masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device) with torch.no_grad(): values = value_net(states) fixed_log_probs = policy_net.get_log_prob( states, player_actions) opponent_fixed_log_probs = opponent_net.get_log_prob( states, opponent_actions) """get advantage estimation from the trajectories""" advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device) """perform mini-batch PPO update""" optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size)) for _ in range(optim_epochs): perm = np.arange(states.shape[0]) np.random.shuffle(perm) perm = LongTensor(perm).to(device) states, player_actions, opponent_actions, returns, advantages, fixed_log_probs, opponent_fixed_log_probs = \ states[perm].clone(), player_actions[perm].clone(), \ opponent_actions[perm].clone(), returns[perm].clone(), \ advantages[perm].clone(), \ fixed_log_probs[perm].clone(), opponent_fixed_log_probs[ perm].clone() for i in range(optim_iter_num): ind = slice( i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0])) states_b, player_actions_b, opponent_actions_b, advantages_b, returns_b, fixed_log_probs_b, opponent_fixed_log_probs_b = \ states[ind], player_actions[ind], opponent_actions[ind], \ advantages[ind], returns[ind], fixed_log_probs[ind], \ opponent_fixed_log_probs[ind] # Update the player ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, player_actions_b, returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg, max_grad=max_grad) # Update the opponent ppo_step(opponent_net, value_net, optimizer_opponent, optimizer_value, 1, states_b, opponent_actions_b, returns_b, advantages_b, opponent_fixed_log_probs_b, args.clip_epsilon, args.l2_reg, opponent=True, max_grad=max_grad)
class DDPG: def __init__( self, env=None, render=False, num_process=1, memory_size=1000000, lr_p=1e-3, lr_v=1e-3, gamma=0.99, polyak=0.995, explore_size=10000, batch_size=100, min_update_step=1000, update_step=50, action_noise=0.1, seed=1, ): self.env = env self.render = render self.gamma = gamma self.polyak = polyak self.memory = FixedMemory(memory_size) self.explore_size = explore_size self.num_process = num_process self.lr_p = lr_p self.lr_v = lr_v self.batch_size = batch_size self.min_update_step = min_update_step self.update_step = update_step self.action_noise = action_noise self.seed = seed self._init_model() def _init_model(self): """init model from parameters""" self.num_states = self.env.observation_space.shape[0] self.num_actions = self.env.action_space.shape[0] self.action_low, self.action_high = self.env.action_space.low[ 0], self.env.action_space.high[0] # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Actor(self.num_states, self.num_actions, self.action_high).to(device) self.policy_net_target = Actor(self.num_states, self.num_actions, self.action_high).to(device) self.value_net = Value(self.num_states + self.num_actions).to(device) self.value_net_target = Value(self.num_states + self.num_actions).to(device) self.policy_net_target.load_state_dict(self.policy_net.state_dict()) self.value_net_target.load_state_dict(self.value_net.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v) def choose_action(self, state, noise_scale): """select action""" self.policy_net.eval() state = FLOAT(state).unsqueeze(0).to(device) with torch.no_grad(): action = self.policy_net(state) self.policy_net.train() action = action.cpu().numpy()[0] # add noise noise = noise_scale * np.random.randn(self.num_actions) action += noise action = np.clip(action, -self.action_high, self.action_high) return action def eval(self, i_iter, render=False): """evaluate model""" self.policy_net.eval() self.value_net.eval() state = self.env.reset() test_reward = 0 while True: if render: self.env.render() action = self.choose_action(state, 0) state, reward, done, _ = self.env.step(action) test_reward += reward if done: break print(f"Iter: {i_iter}, test Reward: {test_reward}") self.env.close() def learn(self, writer, i_iter, step): """interact""" self.policy_net.train() self.value_net.train() state = self.env.reset() episode_reward = 0 while True: if self.render: self.env.render() action = self.choose_action(state, self.action_noise) next_state, reward, done, _ = self.env.step(action) mask = 0 if done else 1 # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob') self.memory.push(state, action, reward, next_state, mask) episode_reward += reward if step >= self.min_update_step and step % self.update_step == 0: for _ in range(self.update_step): batch = self.memory.sample( self.batch_size) # random sample batch self.update(batch) if done: break state = next_state self.env.close() print(f"Iter: {i_iter}, reward: {episode_reward}") # record reward information writer.add_scalar("ddpg/reward", episode_reward, i_iter) def update(self, batch): """learn model""" batch_state = FLOAT(batch.state).to(device) batch_action = FLOAT(batch.action).to(device) batch_reward = FLOAT(batch.reward).to(device) batch_next_state = FLOAT(batch.next_state).to(device) batch_mask = FLOAT(batch.mask).to(device) # update by DDPG ddpg_step(self.policy_net, self.policy_net_target, 
self.value_net, self.value_net_target, self.optimizer_p, self.optimizer_v, batch_state, batch_action, batch_reward, batch_next_state, batch_mask, self.gamma, self.polyak) def load(self, model_path): print(f"Loading Saved Model from {model_path}") self.policy_net, self.value_net = torch.load(model_path, map_location=device) def save(self, save_path): if not os.path.exists(save_path): os.mkdir(save_path) """save model""" torch.save((self.policy_net, self.value_net), f"{save_path}/WebEye_ddpg.pt")
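# Hedged sketch (an assumption about what `ddpg_step` does internally): the Polyak
# averaging that keeps `policy_net_target` / `value_net_target` trailing the online
# networks, using the `polyak=0.995` coefficient from the constructor above.
import torch

def soft_update(target_net, source_net, polyak=0.995):
    with torch.no_grad():
        for p_targ, p in zip(target_net.parameters(), source_net.parameters()):
            # theta_target <- polyak * theta_target + (1 - polyak) * theta
            p_targ.mul_(polyak).add_((1.0 - polyak) * p)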
running_state = ZFilter((state_dim, ), clip=5)

""" Seeding """
np.random.seed(exp_args["config"]["seed"])
torch.manual_seed(exp_args["config"]["seed"])
env.seed(exp_args["config"]["seed"])

""" define policy(actor) and critic(value function predictor) """
if is_discrete_action_space:
    policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                exp_args["model"]["hidden"],
                                exp_args["model"]["activation"])
else:
    raise ValueError("Policy for continuous action spaces is not implemented yet")
value_net = Value(state_dim, exp_args["model"]["hidden"], exp_args["model"]["activation"])

policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=exp_args["config"]["lr"])
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=exp_args["config"]["lr"])

""" Create Agent """
agent = Agent(env, policy_net, device,
              running_state=running_state,
              render=exp_args["config"]["render"],
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
action_dim = (1 if is_disc_action else env_dummy.action_space.shape[0])
ActionTensor = LongTensor if is_disc_action else DoubleTensor

"""define actor, critic and discriminator"""
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
else:
    policy_net = Policy(state_dim, env_dummy.action_space.shape[0])
value_net = Value(state_dim)
discrim_net = Discriminator(state_dim + action_dim)
discrim_criterion = nn.BCELoss()
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    discrim_net = discrim_net.cuda()
    discrim_criterion = discrim_criterion.cuda()

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
def setUp(self) -> None:
    self.value = Value(6, drop_rate=0.5)
    self.value2 = Value(11, drop_rate=0.5)
    print(self.value)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 4)
        policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))

policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

optim_epochs = 10
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

p_nets = []
v_nets = []
p_opts = []
v_opts = []

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        for i in range(env.n_agents):
            p_nets.append(DiscretePolicy(args.dec_agents, env.n_agents, state_dim, env.action_space[0].n))
            v_nets.append(Value(env.n_agents, state_dim))
            # add only one policy and one value network when using the team-unified network setting.
            if args.dec_agents is False:
                break
    else:
        policy_net = Policy(state_dim, env.action_space[0].n, log_std=args.log_std)
else:
    p_nets, v_nets, running_state = pickle.load(open(args.model_path, "rb"))

dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device('cpu')

for i in range(env.n_agents):
np.random.seed(args.seed)
torch.manual_seed(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

"""create agent"""
agent = Agent(env_factory, policy_net, device, running_state=running_state,
              render=args.render, num_threads=args.num_threads)
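# Several of the setup scripts here wrap observations with ZFilter((state_dim,), clip=5).
# ZFilter is the usual running state normalizer in this family of repos; the sketch
# below captures the idea (running mean/std plus clipping) under that assumption and
# may differ from this repo's exact ZFilter.
import numpy as np


class ZFilterSketch:
    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)  # sum of squared deviations (Welford)
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        # Welford update of the running mean and variance
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)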
def train(**kwargs):
    print('here')
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma']
    }

    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    # """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    # optimizers are created here, in train()'s scope, so that update_params can see
    # them; defining them inside main_loop would leave them out of update_params' closure.
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state,
                  render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter, config):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), \
                advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                         states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b,
                         args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)

            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter + 1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

            # """clean up gpu memory"""
            torch.cuda.empty_cache()

        return agent.evaluate()

    print('a')
    print(config)
    print(args)
    return main_loop(config)
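# train() above calls ppo_step, defined elsewhere in the repo, with a literal 1
# presumably being the number of value-function iterations. The sketch below shows
# a standard clipped-surrogate PPO step matching that argument order; it is an
# assumption about the interface, not the repo's exact implementation.
import torch


def ppo_step_sketch(policy_net, value_net, optimizer_policy, optimizer_value,
                    optim_value_iternum, states, actions, returns, advantages,
                    fixed_log_probs, clip_epsilon, l2_reg):
    # value-function regression toward empirical returns, with L2 regularization
    for _ in range(optim_value_iternum):
        value_loss = (value_net(states) - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # clipped surrogate objective on the policy
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()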
np.random.seed(args.seed)
torch.manual_seed(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env_factory, policy_net, device, running_state=running_state,
              render=args.render,
class SAC_Alpha:
    def __init__(self,
                 env,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_a=3e-4,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 ):
        self.env = env
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]
        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]
        self.target_entropy = -np.prod(self.env.action_space.shape)

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions, action_limit=self.action_high).to(device)
        self.q_net_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_2 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device)

        # self.alpha init
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""
        state = self.env.reset()
        episode_reward = 0
        while True:
            if self.render:
                self.env.render()
            action, _ = self.choose_action(state)
            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for k in range(1, self.update_step + 1):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch, k)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("sac_alpha/reward", episode_reward, i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha,
                       self.q_net_target_1, self.q_net_target_2,
                       self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a,
                       batch_state, batch_action, batch_reward, batch_next_state, batch_mask,
                       self.gamma, self.polyak, self.target_entropy,
                       k_iter % self.target_update_delay == 0)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.q_net_1, self.q_net_2, self.alpha = torch.load(model_path, map_location=device)

    def save(self, save_path):
        """save model"""
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save((self.policy_net, self.q_net_1, self.q_net_2, self.alpha),
                   f"{save_path}/WebEye_sac_alpha.pt")
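# SAC_Alpha.update() delegates to sac_alpha_step, defined elsewhere in the repo.
# The sketch below shows one such step with the argument order used above, assuming
# Value concatenates its (state, action) inputs and Actor.get_action_log_prob returns
# a reparameterized action with its log probability. Illustrative only, not the
# repo's exact implementation.
import torch


def sac_alpha_step_sketch(policy_net, q_net_1, q_net_2, alpha,
                          q_net_target_1, q_net_target_2,
                          optimizer_p, optimizer_q_1, optimizer_q_2, optimizer_a,
                          states, actions, rewards, next_states, masks,
                          gamma, polyak, target_entropy, update_target):
    rewards = rewards.unsqueeze(-1) if rewards.dim() == 1 else rewards
    masks = masks.unsqueeze(-1) if masks.dim() == 1 else masks

    # entropy-regularized TD target using the minimum of the twin target critics
    with torch.no_grad():
        next_actions, next_log_probs = policy_net.get_action_log_prob(next_states)
        target_q = torch.min(q_net_target_1(next_states, next_actions),
                             q_net_target_2(next_states, next_actions))
        backup = rewards + gamma * masks * (target_q - alpha * next_log_probs)

    # twin critic updates
    for q_net, optimizer_q in [(q_net_1, optimizer_q_1), (q_net_2, optimizer_q_2)]:
        q_loss = (q_net(states, actions) - backup).pow(2).mean()
        optimizer_q.zero_grad()
        q_loss.backward()
        optimizer_q.step()

    # policy update: minimize alpha * log_prob - min(Q1, Q2)
    new_actions, log_probs = policy_net.get_action_log_prob(states)
    q_new = torch.min(q_net_1(states, new_actions), q_net_2(states, new_actions))
    policy_loss = (alpha.detach() * log_probs - q_new).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # temperature update toward the target entropy
    alpha_loss = -(alpha * (log_probs.detach() + target_entropy)).mean()
    optimizer_a.zero_grad()
    alpha_loss.backward()
    optimizer_a.step()

    # delayed polyak averaging of the target critics
    if update_target:
        with torch.no_grad():
            for target, source in [(q_net_target_1, q_net_1), (q_net_target_2, q_net_2)]:
                for t_param, param in zip(target.parameters(), source.parameters()):
                    t_param.data.mul_(polyak).add_(param.data, alpha=1 - polyak)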