def __init__(self, alpha, beta, input_dims, tau, env, env_id, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=256, layer2_size=256,
             batch_size=100, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id+'_actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_2')
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name=env_id+'_value')
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name=env_id+'_target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             ent_alpha=0.02, batch_size=256, reward_scale=2,
             layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.ent_alpha = ent_alpha

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              fc1_dims=layer1_size, fc2_dims=layer2_size,
                              name='actor', max_action=env.action_space.high,
                              chkpt_dir=chkpt_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_1', chkpt_dir=chkpt_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_2', chkpt_dir=chkpt_dir)
    self.value = ValueNetwork(beta, input_dims, fc1_dims=layer1_size,
                              fc2_dims=layer2_size, name='value',
                              chkpt_dir=chkpt_dir)
    self.target_value = ValueNetwork(beta, input_dims, fc1_dims=layer1_size,
                                     fc2_dims=layer2_size, name='target_value',
                                     chkpt_dir=chkpt_dir)

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, input_dims, env, n_actions):
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions

    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=Constants.env_id + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_2')
    self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
    self.target_value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_target_value')

    self.update_network_parameters(tau=1)
def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None, gamma=0.99,
             n_actions=2, max_size=1000000, tau=5e-3, fc1_dim=256, fc2_dim=256,
             batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions, env.action_space.high)
    self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1')
    self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(name='value')
    self.target_value = ValueNetwork(name='target_value')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic_1.compile(optimizer=Adam(learning_rate=beta))
    self.critic_2.compile(optimizer=Adam(learning_rate=beta))
    self.value.compile(optimizer=Adam(learning_rate=beta))
    self.target_value.compile(optimizer=Adam(learning_rate=beta))

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0,
             n_actions=2, max_size=1000000, layer1_size=400, layer2_size=300,
             batch_size=100, reward_scale=2, path_dir='model/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='_actor',
                              max_action=max_action, chkpt_dir=path_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_1',
                                  chkpt_dir=path_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_2',
                                  chkpt_dir=path_dir)
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name='_value', chkpt_dir=path_dir)
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name='_target_value', chkpt_dir=path_dir)

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    # sets the target-network parameters equal to the value network's at the start
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.00005, beta=0.00005, input_dims=5, env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = 0.99
    self.tau = tau
    self.memeory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    latent_dims = 10
    self.actor = ActorNetwork_2(alpha, latent_dims, env.action_space.high,
                                n_actions=n_actions)
    self.critic_1 = CriticNetwork(beta, latent_dims, n_actions, name='critic_det_1')
    self.critic_2 = CriticNetwork(beta, latent_dims, n_actions, name='critic__det_2')
    self.value = ValueNetwork(beta, latent_dims, name='value_det')
    self.target_value = ValueNetwork(beta, latent_dims, name='target_value_det')
    self.VAE = LinearVAE()

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, env_id: str, config: Config, pid: int = None,
             epsilon: float = 0., summary_writer: tf.summary.SummaryWriter = None):
    self.env_id = env_id
    self.config = config
    self.pid = pid
    self.epsilon = epsilon
    self.summary_writer = summary_writer

    self.action_space = gym.make(self.env_id).action_space.n
    self.preprocess_func = util.get_preprocess_func(env_name=self.env_id)
    self.buffer = EpisodeBuffer(seqlen=self.config.sequence_length)

    self.world_model = WorldModel(config)
    self.wm_optimizer = tf.keras.optimizers.Adam(lr=self.config.lr_world, epsilon=1e-4)

    self.policy = PolicyNetwork(action_space=self.action_space)
    self.policy_optimizer = tf.keras.optimizers.Adam(lr=self.config.lr_actor, epsilon=1e-5)

    self.value = ValueNetwork(action_space=self.action_space)
    self.target_value = ValueNetwork(action_space=self.action_space)
    self.value_optimizer = tf.keras.optimizers.Adam(lr=self.config.lr_critic, epsilon=1e-5)

    self.setup()
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], max_action=1,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=512, layer2_size=512, batch_size=512, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=max_action)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
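# Most of the constructors above end with update_network_parameters(tau=1): with tau=1 the
# target value network becomes an exact copy of the online value network, while during
# training the same method is called with the small default tau to apply a Polyak (soft)
# update. A minimal standalone PyTorch sketch of that update (function name is illustrative,
# not taken from the snippets above):
import torch

def soft_update(online: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # target <- tau * online + (1 - tau) * target
    with torch.no_grad():
        for p_online, p_target in zip(online.parameters(), target.parameters()):
            p_target.mul_(1.0 - tau).add_(tau * p_online)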
class Agent(): def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2): self.gamma = gamma self.tau = tau self.memory = ReplayBuffer(max_size, input_dims, n_actions) self.batch_size = batch_size self.n_actions = n_actions self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions, name='actor', max_action=env.action_space.high) self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_1') self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_2') self.value = ValueNetwork(beta, input_dims, name='value') self.target_value = ValueNetwork(beta, input_dims, name='target_value') self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, observation): state = T.Tensor([observation]).to(self.actor.device) actions, _ = self.actor.sample_normal(state, reparameterize=False) return actions.cpu().detach().numpy()[0] def remember(self, state, action, reward, new_state, done): self.memory.store_transition(state, action, reward, new_state, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau target_value_params = self.target_value.named_parameters() value_params = self.value.named_parameters() target_value_state_dict = dict(target_value_params) value_state_dict = dict(value_params) for name in value_state_dict: value_state_dict[name] = tau*value_state_dict[name].clone() + \ (1-tau)*target_value_state_dict[name].clone() self.target_value.load_state_dict(value_state_dict) def save_models(self): print('.... saving models ....') self.actor.save_checkpoint() self.value.save_checkpoint() self.target_value.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() def load_models(self): print('.... 
loading models ....') self.actor.load_checkpoint() self.value.load_checkpoint() self.target_value.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() def learn(self): if self.memory.mem_cntr < self.batch_size: return state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) reward = T.tensor(reward, dtype=T.float).to(self.actor.device) done = T.tensor(done).to(self.actor.device) state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device) state = T.tensor(state, dtype=T.float).to(self.actor.device) action = T.tensor(action, dtype=T.float).to(self.actor.device) value = self.value(state).view(-1) value_ = self.target_value(state_).view(-1) value_[done] = 0.0 actions, log_probs = self.actor.sample_normal(state, reparameterize=False) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1.forward(state, actions) q2_new_policy = self.critic_2.forward(state, actions) critic_value = T.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) self.value.optimizer.zero_grad() value_target = critic_value - log_probs value_loss = 0.5 * F.mse_loss(value, value_target) value_loss.backward(retain_graph=True) self.value.optimizer.step() actions, log_probs = self.actor.sample_normal(state, reparameterize=True) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1.forward(state, actions) q2_new_policy = self.critic_2.forward(state, actions) critic_value = T.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) actor_loss = log_probs - critic_value actor_loss = T.mean(actor_loss) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() q_hat = self.scale * reward + self.gamma * value_ q1_old_policy = self.critic_1.forward(state, action).view(-1) q2_old_policy = self.critic_2.forward(state, action).view(-1) critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) critic_loss = critic_1_loss + critic_2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.update_network_parameters()
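# A minimal training-loop sketch for the Agent class above; the environment id and episode
# budget are assumptions, while the agent API (choose_action / remember / learn) is as
# defined in the class.
import gym

env = gym.make('LunarLanderContinuous-v2')   # assumed 8-dim observation, 2-dim action env
agent = Agent(input_dims=env.observation_space.shape, env=env,
              n_actions=env.action_space.shape[0])

for episode in range(250):                   # assumed episode budget
    observation = env.reset()
    done, score = False, 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()                        # no-op until the buffer holds a full batch
        observation = observation_
        score += reward
    print(f'episode {episode}  score {score:.1f}')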
# the number of training epochs
num_of_epoch = 8
# the batch size for gradient descent during training
batch_sz = 64

# set up the criterion
criterion = nn.CrossEntropyLoss().to(device)

# set up models
clstm = CNN_LSTM(INPUT_DIM, EMBEDDING_DIM, KER_SIZE, N_FILTERS, HIDDEN_DIM).to(device)
print(clstm)
policy_s = Policy_S(HIDDEN_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
policy_n = Policy_N(HIDDEN_DIM, HIDDEN_DIM, MAX_K).to(device)
policy_c = Policy_C(HIDDEN_DIM, HIDDEN_DIM, LABEL_DIM).to(device)
value_net = ValueNetwork(HIDDEN_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# set up optimisers
params_pg = list(policy_s.parameters()) + list(policy_c.parameters()) + list(policy_n.parameters())
optim_loss = optim.Adam(clstm.parameters(), lr=learning_rate)
optim_policy = optim.Adam(params_pg, lr=learning_rate)
optim_value = optim.Adam(value_net.parameters(), lr=learning_rate)

# add pretrained embeddings
pretrained_embeddings = TEXT.vocab.vectors
clstm.embedding.weight.data.copy_(pretrained_embeddings)
clstm.embedding.weight.requires_grad = True  # update the initial weights

# set the default tensor type for GPU
# torch.set_default_tensor_type('torch.cuda.FloatTensor')
class Agent_sm(): def __init__(self, alpha=0.0003, beta=0.0003, input_dims=8, env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2): self.gamma = 0.99 self.tau = tau self.memeory = ReplayBuffer(max_size, input_dims, n_actions) self.batch_size = batch_size self.n_actions = n_actions self.actor = ActorNetwork(alpha, input_dims, env.action_space.high, n_actions=n_actions) self.critic_1 = CriticNetwork(beta, input_dims, n_actions, name='critic_1') self.critic_2 = CriticNetwork(beta, input_dims, n_actions, name='critic_2') self.value = ValueNetwork(beta, input_dims, name='value') self.target_value = ValueNetwork(beta, input_dims, name='target_value') self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, observation): state = torch.Tensor([observation]).to(self.actor.device) actions, _ = self.actor.sample_normal(state, reparameterize=False) return actions.cpu().detach().numpy()[0] def remember(self, state, action, reward, new_state, done): self.memeory.store_transition(state, action, reward, new_state, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau target_value_params = self.target_value.named_parameters() value_params = self.value.named_parameters() target_value_dict = dict(target_value_params) value_dict = dict(value_params) for name in target_value_dict: target_value_dict[name] = tau*value_dict[name].clone() + \ (1-tau)*target_value_dict[name].clone() self.target_value.load_state_dict(target_value_dict) def save_models(self): print('... saving models ...') self.actor.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() self.value.save_checkpoint() self.target_value.save_checkpoint() def load_models(self): print('... 
loading models ...') self.actor.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() self.value.load_checkpoint() self.target_value.load_checkpoint() def learn(self): if self.memeory.mem_cntr < self.batch_size: return states, new_states, actions, rewards, dones = self.memeory.sample_buffer( self.batch_size) states = torch.tensor(states, dtype=torch.float).to(self.actor.device) new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device) actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device) rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device) dones = torch.tensor(dones).to(self.actor.device) states_value = self.value(states).view(-1) new_states_value = self.target_value(new_states).view(-1) new_states_value[dones] = 0.0 action, log_probs = self.actor.sample_normal(states, reparameterize=False) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1(states, action) q2_new_policy = self.critic_2(states, action) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) self.value.optimizer.zero_grad() value_target = critic_value - log_probs value_loss = 0.5 * F.mse_loss(states_value, value_target) value_loss.backward(retain_graph=True) self.value.optimizer.step() action, log_probs = self.actor.sample_normal(states, reparameterize=True) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1(states, action) q2_new_policy = self.critic_2(states, action) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) actor_loss = log_probs - critic_value actor_loss = torch.mean(actor_loss) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() q_hat = self.scale * rewards + self.gamma * new_states_value q1_old_policy = self.critic_1(states, actions).view(-1) q2_old_policy = self.critic_2(states, actions).view(-1) critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) critic_loss = critic1_loss + critic2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.update_network_parameters() # value_loss = value_loss.cpu().detach().numpy()[0] # actor_loss = actor_loss.cpu().detach().numpy()[0] # critic_loss = critic_loss.cpu().detach().numpy()[0] return 0, value_loss, actor_loss, critic_loss def learn_sm(self, sm_reg=1): if self.memeory.mem_cntr < self.batch_size: return states, new_states, actions, rewards, dones = self.memeory.sample_buffer( self.batch_size) states = torch.tensor(states, dtype=torch.float).to(self.actor.device) new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device) actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device) rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device) dones = torch.tensor(dones).to(self.actor.device) states_value = self.value(states).view(-1) new_states_value = self.target_value(new_states).view(-1) new_states_value[dones] = 0.0 # action, log_probs = self.actor.sample_normal(states, reparameterize=False) # log_probs = log_probs.view(-1) # q1_new_policy = self.critic_1(states, action) # q2_new_policy = self.critic_2(states, action) # critic_value = torch.min(q1_new_policy, q2_new_policy) # critic_value = critic_value.view(-1) # self.value.optimizer.zero_grad() # value_target = critic_value - log_probs # value_loss = 0.5 * 
F.mse_loss(states_value, value_target) # value_loss.backward(retain_graph=True) # self.value.optimizer.step() # action, log_probs = self.actor.sample_normal(states, reparameterize=True) action, _ = self.actor.sample_normal(states, reparameterize=True) # log_probs = log_probs.view(-1) q1_new_policy = self.critic_1(states, action) q2_new_policy = self.critic_2(states, action) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) # sample actions for next batch states action_next, _ = self.actor.sample_normal(new_states, reparameterize=True) q1_new_policy = self.critic_1(new_states, action_next) q2_new_policy = self.critic_2(new_states, action_next) critic_value_next = torch.min(q1_new_policy, q2_new_policy) critic_value_next = critic_value.view(-1) # actor_loss = log_probs - critic_value actor_loss = -(critic_value + critic_value_next) + sm_reg * F.mse_loss( action, action_next) actor_loss = torch.mean(actor_loss) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() # self.critic_1.optimizer.zero_grad() # self.critic_2.optimizer.zero_grad() # q_hat = self.scale*rewards + self.gamma*new_states_value # q1_old_policy = self.critic_1(states, actions).view(-1) # q2_old_policy = self.critic_2(states, actions).view(-1) # critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) # critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) # critic_loss = critic1_loss + critic2_loss # critic_loss.backward() # self.critic_1.optimizer.step() # self.critic_2.optimizer.step() # self.update_network_parameters() return 0, 0, actor_loss, 0
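# The learn_sm() variant above drops the entropy term and instead regularizes the actor
# toward smooth actions across consecutive states: it maximizes the clipped-double-Q value
# of the sampled action at both the current and the next state while penalizing the squared
# change between the two actions. A condensed sketch of that objective as read from the
# code (function name and signature are illustrative, not from the source):
import torch
import torch.nn.functional as F

def smooth_actor_loss(actor, critic_1, critic_2, states, new_states, sm_reg=1.0):
    # actions re-sampled with the reparameterization trick so gradients reach the actor
    a, _ = actor.sample_normal(states, reparameterize=True)
    a_next, _ = actor.sample_normal(new_states, reparameterize=True)
    q = torch.min(critic_1(states, a), critic_2(states, a)).view(-1)
    q_next = torch.min(critic_1(new_states, a_next), critic_2(new_states, a_next)).view(-1)
    # maximize both Q-values, penalize the squared change between consecutive actions
    return (-(q + q_next) + sm_reg * F.mse_loss(a, a_next)).mean()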
class Agent_2(): def __init__(self, alpha=0.00005, beta=0.00005, input_dims=5, env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2): self.gamma = 0.99 self.tau = tau self.memeory = ReplayBuffer(max_size, input_dims, n_actions) self.batch_size = batch_size self.n_actions = n_actions latent_dims = 10 self.actor = ActorNetwork_2(alpha, latent_dims, env.action_space.high, n_actions=n_actions) self.critic_1 = CriticNetwork(beta, latent_dims, n_actions, name='critic_det_1') self.critic_2 = CriticNetwork(beta, latent_dims, n_actions, name='critic__det_2') self.value = ValueNetwork(beta, latent_dims, name='value_det') self.target_value = ValueNetwork(beta, latent_dims, name='target_value_det') self.VAE = LinearVAE() self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, observation): state = torch.Tensor([observation]).to(self.actor.device) state_latent = self.VAE.sample_normal(state) actions = self.actor(state_latent) return actions.cpu().detach().numpy()[0] def remember(self, state, action, reward, new_state, done): self.memeory.store_transition(state, action, reward, new_state, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau target_value_params = self.target_value.named_parameters() value_params = self.value.named_parameters() target_value_dict = dict(target_value_params) value_dict = dict(value_params) for name in target_value_dict: target_value_dict[name] = tau*value_dict[name].clone() + \ (1-tau)*target_value_dict[name].clone() self.target_value.load_state_dict(target_value_dict) def save_models(self): print('... saving models ...') self.actor.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() self.value.save_checkpoint() self.target_value.save_checkpoint() def load_models(self): print('... 
loading models ...') self.actor.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() self.value.load_checkpoint() self.target_value.load_checkpoint() def learn(self): if self.memeory.mem_cntr < self.batch_size: return states, new_states, actions, rewards, dones = self.memeory.sample_buffer( self.batch_size) states = torch.tensor(states, dtype=torch.float).to(self.actor.device) new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device) actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device) rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device) dones = torch.tensor(dones).to(self.actor.device) # Train VAE with KL divergence + reconstruction_loss + log_probs reconstruction, mu, logvar, log_probs = self.VAE(states) KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) reconstruction_loss = F.mse_loss(reconstruction, states) final_loss = KLD + reconstruction_loss self.VAE.optimizer.zero_grad() final_loss.backward(retain_graph=True) self.VAE.optimizer.step() latent_states = self.VAE.sample_normal(states) states_value = self.value(latent_states).view(-1) new_latent_states = self.VAE.sample_normal(new_states) new_states_value = self.target_value(new_latent_states).view(-1) new_states_value[dones] = 0.0 action = self.actor(latent_states) q1_new_policy = self.critic_1(latent_states, action) q2_new_policy = self.critic_2(latent_states, action) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) self.value.optimizer.zero_grad() value_target = critic_value value_loss = 0.5 * F.mse_loss(states_value, value_target) value_loss.backward(retain_graph=True) self.value.optimizer.step() actor_loss = -critic_value actor_loss = torch.mean(actor_loss) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() q_hat = self.scale * rewards + self.gamma * new_states_value q1_old_policy = self.critic_1(latent_states, actions).view(-1) q2_old_policy = self.critic_2(latent_states, actions).view(-1) critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) critic_loss = critic1_loss + critic2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.update_network_parameters() return final_loss, value_loss, actor_loss, critic_loss
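# Agent_2 trains its LinearVAE alongside the SAC networks by minimizing the standard
# Gaussian-VAE objective (KL divergence to a unit Gaussian plus a reconstruction term)
# before encoding states into the 10-dimensional latent space. The loss in isolation,
# assuming the tensors returned by the VAE's forward pass (helper name is illustrative):
import torch
import torch.nn.functional as F

def vae_loss(reconstruction, states, mu, logvar):
    # KL( N(mu, sigma^2) || N(0, 1) ), summed over latent dimensions and batch
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    # mean-squared reconstruction error against the original states
    recon = F.mse_loss(reconstruction, states)
    return kld + recon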
class Agent(): def __init__(self, alpha=0.0003, beta=0.0003, input_dims=8, env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2): self.gamma = 0.99 self.tau = tau self.memeory = ReplayBuffer(max_size, input_dims, n_actions) self.batch_size = batch_size self.n_actions = n_actions self.actor = ActorNetwork(alpha, input_dims, env.action_space.high, n_actions=n_actions) self.critic_1 = CriticNetwork(beta, input_dims, n_actions, name='critic_1') self.critic_2 = CriticNetwork(beta, input_dims, n_actions, name='critic_2') self.value = ValueNetwork(beta, input_dims, name='value') self.target_value = ValueNetwork(beta, input_dims, name='target_value') self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, observation): state = torch.Tensor([observation]).to(self.actor.device) actions, _ = self.actor.sample_normal(state, reparameterize=False) return actions.cpu().detach().numpy()[0] def remember(self, state, action, reward, new_state, done): self.memeory.store_transition(state, action, reward, new_state, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau target_value_params = self.target_value.named_parameters() value_params = self.value.named_parameters() target_value_dict = dict(target_value_params) value_dict = dict(value_params) for name in target_value_dict: target_value_dict[name] = tau*value_dict[name].clone() + \ (1-tau)*target_value_dict[name].clone() self.target_value.load_state_dict(target_value_dict) def save_models(self): print('... saving models ...') self.actor.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() self.value.save_checkpoint() self.target_value.save_checkpoint() def load_models(self): print('... 
loading models ...') self.actor.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() self.value.load_checkpoint() self.target_value.load_checkpoint() def learn(self): if self.memeory.mem_cntr < self.batch_size: return states, new_states, actions, rewards, dones = self.memeory.sample_buffer( self.batch_size) states = torch.tensor(states, dtype=torch.float).to(self.actor.device) new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device) actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device) rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device) dones = torch.tensor(dones).to(self.actor.device) states_value = self.value(states).view(-1) new_states_value = self.target_value(new_states).view(-1) new_states_value[dones] = 0.0 action, log_probs = self.actor.sample_normal(states, reparameterize=False) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1(states, action) q2_new_policy = self.critic_2(states, action) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) self.value.optimizer.zero_grad() value_target = critic_value - log_probs value_loss = 0.5 * F.mse_loss(states_value, value_target) value_loss.backward(retain_graph=True) self.value.optimizer.step() action, log_probs = self.actor.sample_normal(states, reparameterize=True) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1(states, action) q2_new_policy = self.critic_2(states, action) critic_value = torch.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) actor_loss = log_probs - critic_value actor_loss = torch.mean(actor_loss) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() q_hat = self.scale * rewards + self.gamma * new_states_value q1_old_policy = self.critic_1(states, actions).view(-1) q2_old_policy = self.critic_2(states, actions).view(-1) critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) critic_loss = critic1_loss + critic2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.update_network_parameters() return value_loss, actor_loss, critic_loss def train_on_env(self, env): rewards = [] done = False observation = env.reset() while not done: action = self.choose_action(observation) observation_, reward, done, _ = env.step(action) self.remember(observation, action, reward, observation_, done) #if not load_checkpoints: self.learn() observation = observation_ rewards.append(reward) return np.sum(rewards) def generate_session(self, env, t_max=1000): states, traj_probs, actions, rewards = [], [], [], [] s = env.reset() q_t = 0 for t in range(t_max): state = torch.Tensor([s]).to(self.actor.device) action, log_probs = self.actor.sample_normal(state, reparameterize=False) action = action.cpu().detach().numpy()[0] new_s, r, done, info = env.step(action) log_probs = log_probs.cpu().detach().numpy()[0] #q_t *= probs q_t += log_probs[0] states.append(s.tolist()) traj_probs.append(q_t) actions.append(action[0]) rewards.append(r) s = new_s if done: break return np.array(states), np.array(traj_probs), np.array( actions), np.array(rewards)
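# Because this variant bundles the interaction loop in train_on_env() and returns the
# episode return, a driver only needs one call per episode. The environment id and episode
# budget below are assumptions; the agent API is as defined above.
import gym

env = gym.make('LunarLanderContinuous-v2')   # assumed environment
agent = Agent(input_dims=env.observation_space.shape[0], env=env,
              n_actions=env.action_space.shape[0])

for episode in range(100):                   # assumed budget
    episode_return = agent.train_on_env(env)
    print(f'episode {episode}  return {episode_return:.1f}')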
class DreamerV2Agent: def __init__(self, env_id: str, config: Config, pid: int = None, epsilon: float = 0., summary_writer: tf.summary.SummaryWriter = None): self.env_id = env_id self.config = config self.pid = pid self.epsilon = epsilon self.summary_writer = summary_writer self.action_space = gym.make(self.env_id).action_space.n self.preprocess_func = util.get_preprocess_func(env_name=self.env_id) self.buffer = EpisodeBuffer(seqlen=self.config.sequence_length) self.world_model = WorldModel(config) self.wm_optimizer = tf.keras.optimizers.Adam(lr=self.config.lr_world, epsilon=1e-4) self.policy = PolicyNetwork(action_space=self.action_space) self.policy_optimizer = tf.keras.optimizers.Adam( lr=self.config.lr_actor, epsilon=1e-5) self.value = ValueNetwork(action_space=self.action_space) self.target_value = ValueNetwork(action_space=self.action_space) self.value_optimizer = tf.keras.optimizers.Adam( lr=self.config.lr_critic, epsilon=1e-5) self.setup() def setup(self): """ Build network weights """ env = gym.make(self.env_id) obs = self.preprocess_func(env.reset()) prev_z, prev_h = self.world_model.get_initial_state(batch_size=1) prev_a = tf.one_hot([0], self.action_space) _outputs = self.world_model(obs, prev_z, prev_h, prev_a) (h, z_prior, z_prior_prob, z_post, z_post_prob, feat, img_out, reward_pred, disc_logit) = _outputs self.policy(feat) self.value(feat) self.target_value(feat) self.target_value.set_weights(self.value.get_weights()) def save(self, savedir=None): savedir = Path(savedir) if savedir is not None else Path( "./checkpoints") self.world_model.save_weights(str(savedir / "worldmodel")) self.policy.save_weights(str(savedir / "policy")) self.value.save_weights(str(savedir / "critic")) def load(self, loaddir=None): loaddir = Path(loaddir) if loaddir is not None else Path("checkpoints") self.world_model.load_weights(str(loaddir / "worldmodel")) self.policy.load_weights(str(loaddir / "policy")) self.value.load_weights(str(loaddir / "critic")) self.target_value.load_weights(str(loaddir / "critic")) def set_weights(self, weights): wm_weights, policy_weights, value_weights = weights self.world_model.set_weights(wm_weights) self.policy.set_weights(policy_weights) self.value.set_weights(value_weights) self.target_value.set_weights(value_weights) def get_weights(self): weights = ( self.world_model.get_weights(), self.policy.get_weights(), self.value.get_weights(), ) return weights def rollout(self, weights=None): if weights: self.set_weights(weights) env = gym.make(self.env_id) obs = self.preprocess_func(env.reset()) episode_steps, episode_rewards = 0, 0 prev_z, prev_h = self.world_model.get_initial_state(batch_size=1) prev_a = tf.convert_to_tensor([[0] * self.action_space], dtype=tf.float32) done = False lives = int(env.ale.lives()) while not done: h = self.world_model.step_h(prev_z, prev_h, prev_a) feat, z = self.world_model.get_feature(obs, h) action = self.policy.sample_action(feat, self.epsilon) action_onehot = tf.one_hot([action], self.action_space) next_frame, reward, done, info = env.step(action) next_obs = self.preprocess_func(next_frame) #: Note: DreamerV2 paper uses tanh clipping _reward = reward if reward <= 1.0 else 1.0 #: Life loss as episode end if info["ale.lives"] != lives: _done = True lives = int(info["ale.lives"]) else: _done = done #: (r_t-1, done_t-1, obs_t, action_t, done_t) self.buffer.add(obs, action_onehot, _reward, next_obs, _done, prev_z, prev_h, prev_a) #: Update states obs = next_obs prev_z, prev_h, prev_a = z, h, action_onehot episode_steps += 1 episode_rewards 
+= reward if episode_steps > 4000: _ = self.buffer.get_episode() return self.pid, [], 0, 0 sequences = self.buffer.get_sequences() return self.pid, sequences, episode_steps, episode_rewards def update_networks(self, minibatchs): for minibatch in minibatchs: z_posts, hs, info = self.update_worldmodel(minibatch) trajectory_in_dream = self.rollout_in_dream(z_posts, hs) info_ac = self.update_actor_critic(trajectory_in_dream) info.update(info_ac) return self.get_weights(), info def update_worldmodel(self, minibatch): """ Inputs: minibatch = { "obs": (L, B, 64, 64, 1) "action": (L, B, action_space) "reward": (L, B) "done": (L, B) "prev_z": (1, B, latent_dim * n_atoms) "prev_h": (1, B, 600) "prev_a": (1, B, action_space) } Note: 1. re-compute post and prior z by unrolling sequences from initial states, obs, prev_z, prev_h and prev_action 2. Conmpute KL loss (post_z, prior_z) 3. Reconstrunction loss, reward, discount loss """ (observations, actions, rewards, next_observations, dones, prev_z, prev_h, prev_a) = minibatch.values() discounts = (1. - dones) * self.config.gamma_discount prev_z, prev_h, prev_a = prev_z[0], prev_h[0], prev_a[0] last_obs = next_observations[-1][None, ...] observations = tf.concat([observations, last_obs], axis=0) #: dummy action to avoid IndexError at last iteration last_action = tf.zeros((1, ) + actions.shape[1:]) actions = tf.concat([actions, last_action], axis=0) L = self.config.sequence_length with tf.GradientTape() as tape: hs, z_prior_probs, z_posts, z_post_probs = [], [], [], [] img_outs, r_means, disc_logits = [], [], [] for t in tf.range(L + 1): _outputs = self.world_model(observations[t], prev_z, prev_h, prev_a) (h, z_prior, z_prior_prob, z_post, z_post_prob, feat, img_out, reward_mu, disc_logit) = _outputs hs.append(h) z_prior_probs.append(z_prior_prob) z_posts.append(z_post) z_post_probs.append(z_post_prob) img_outs.append(img_out) r_means.append(reward_mu) disc_logits.append(disc_logit) prev_z, prev_h, prev_a = z_post, h, actions[t] #: Reshape outputs #: [(B, ...), (B, ...), ...] -> (L+1, B, ...) -> (L, B, ...) hs = tf.stack(hs, axis=0)[:-1] z_prior_probs = tf.stack(z_prior_probs, axis=0)[:-1] z_posts = tf.stack(z_posts, axis=0)[:-1] z_post_probs = tf.stack(z_post_probs, axis=0)[:-1] img_outs = tf.stack(img_outs, axis=0)[:-1] r_means = tf.stack(r_means, axis=0)[1:] disc_logits = tf.stack(disc_logits, axis=0)[1:] #: Compute loss terms kl_loss = self._compute_kl_loss(z_prior_probs, z_post_probs) img_log_loss = self._compute_img_log_loss(observations[:-1], img_outs) reward_log_loss = self._compute_log_loss(rewards, r_means, mode="reward") discount_log_loss = self._compute_log_loss(discounts, disc_logits, mode="discount") loss = -img_log_loss - reward_log_loss - discount_log_loss + self.config.kl_scale * kl_loss loss *= 1. / L grads = tape.gradient(loss, self.world_model.trainable_variables) grads, norm = tf.clip_by_global_norm(grads, 100.) 
self.wm_optimizer.apply_gradients( zip(grads, self.world_model.trainable_variables)) info = { "wm_loss": L * loss, "img_log_loss": -img_log_loss, "reward_log_loss": -reward_log_loss, "discount_log_loss": -discount_log_loss, "kl_loss": kl_loss } return z_posts, hs, info @tf.function def _compute_kl_loss(self, post_probs, prior_probs): """ Compute KL divergence between two OnehotCategorical Distributions Notes: KL[ Q(z_post) || P(z_prior) ] Q(z_prior) := Q(z | h, o) P(z_prior) := P(z | h) Scratch Impl.: qlogq = post_probs * tf.math.log(post_probs) qlogp = post_probs * tf.math.log(prior_probs) kl_div = tf.reduce_sum(qlogq - qlogp, [1, 2]) Inputs: prior_probs (L, B, latent_dim, n_atoms) post_probs (L, B, latent_dim, n_atoms) """ #: Add small value to prevent inf kl post_probs += 1e-5 prior_probs += 1e-5 #: KL Balancing: See 2.2 BEHAVIOR LEARNING Algorithm 2 kl_div1 = tfd.kl_divergence( tfd.Independent( tfd.OneHotCategorical(probs=tf.stop_gradient(post_probs)), reinterpreted_batch_ndims=1), tfd.Independent(tfd.OneHotCategorical(probs=prior_probs), reinterpreted_batch_ndims=1)) kl_div2 = tfd.kl_divergence( tfd.Independent(tfd.OneHotCategorical(probs=post_probs), reinterpreted_batch_ndims=1), tfd.Independent( tfd.OneHotCategorical(probs=tf.stop_gradient(prior_probs)), reinterpreted_batch_ndims=1)) alpha = self.config.kl_alpha kl_loss = alpha * kl_div1 + (1. - alpha) * kl_div2 #: Batch mean kl_loss = tf.reduce_mean(kl_loss) return kl_loss @tf.function def _compute_img_log_loss(self, img_in, img_out): """ Inputs: img_in: (L, B, 64, 64, 1) img_out: (L, B, 64, 64, 1) """ L, B, H, W, C = img_in.shape img_in = tf.reshape(img_in, (L * B, H * W * C)) img_out = tf.reshape(img_out, (L * B, H * W * C)) dist = tfd.Independent(tfd.Normal(loc=img_out, scale=1.)) #dist = tfd.Independent(tfd.Bernoulli(logits=img_out)) log_prob = dist.log_prob(img_in) loss = tf.reduce_mean(log_prob) return loss @tf.function def _compute_log_loss(self, y_true, y_pred, mode): """ Inputs: y_true: (L, B, 1) y_pred: (L, B, 1) mode: "reward" or "discount" """ if mode == "discount": dist = tfd.Independent(tfd.Bernoulli(logits=y_pred), reinterpreted_batch_ndims=1) elif mode == "reward": dist = tfd.Independent(tfd.Normal(loc=y_pred, scale=1.), reinterpreted_batch_ndims=1) log_prob = dist.log_prob(y_true) loss = tf.reduce_mean(log_prob) return loss def rollout_in_dream(self, z_init, h_init, video=False): """ Inputs: h_init: (L, B, 1) z_init: (L, B, latent_dim * n_atoms) done_init: (L, B, 1) """ L, B = h_init.shape[:2] horizon = self.config.imagination_horizon z, h = tf.reshape(z_init, [L * B, -1]), tf.reshape(h_init, [L * B, -1]) feats = tf.concat([z, h], axis=-1) #: s_t, a_t, s_t+1 trajectory = {"state": [], "action": [], 'next_state': []} for t in range(horizon): actions = tf.cast(self.policy.sample(feats), dtype=tf.float32) trajectory["state"].append(feats) trajectory["action"].append(actions) h = self.world_model.step_h(z, h, actions) z, _ = self.world_model.rssm.sample_z_prior(h) z = tf.reshape(z, [L * B, -1]) feats = tf.concat([z, h], axis=-1) trajectory["next_state"].append(feats) trajectory = {k: tf.stack(v, axis=0) for k, v in trajectory.items()} #: reward_head(s_t+1) -> r_t #: Distribution.mode()は確立最大値を返すのでNormalの場合は #: trjactory["reward"] == rewards rewards = self.world_model.reward_head(trajectory['next_state']) trajectory["reward"] = rewards disc_logits = self.world_model.discount_head(trajectory['next_state']) trajectory["discount"] = tfd.Independent( tfd.Bernoulli(logits=disc_logits), reinterpreted_batch_ndims=1).mean() 
return trajectory def update_actor_critic(self, trajectory, batch_size=512, strategy="PPO"): """ Actor-Critic update using PPO & Generalized Advantage Estimator """ #: adv: (L*B, 1) targets, weights = self.compute_target(trajectory['state'], trajectory['reward'], trajectory['next_state'], trajectory['discount']) #: (H, L*B, ...) states = trajectory['state'] selected_actions = trajectory['action'] N = weights.shape[0] * weights.shape[1] states = tf.reshape(states, [N, -1]) selected_actions = tf.reshape(selected_actions, [N, -1]) targets = tf.reshape(targets, [N, -1]) weights = tf.reshape(weights, [N, -1]) _, old_action_probs = self.policy(states) old_logprobs = tf.math.log(old_action_probs + 1e-5) for _ in range(10): indices = np.random.choice(N, batch_size) _states = tf.gather(states, indices) _targets = tf.gather(targets, indices) _selected_actions = tf.gather(selected_actions, indices) _old_logprobs = tf.gather(old_logprobs, indices) _weights = tf.gather(weights, indices) #: Update value network with tf.GradientTape() as tape1: v_pred = self.value(_states) advantages = _targets - v_pred value_loss = 0.5 * tf.square(advantages) discount_value_loss = tf.reduce_mean(value_loss * _weights) grads = tape1.gradient(discount_value_loss, self.value.trainable_variables) self.value_optimizer.apply_gradients( zip(grads, self.value.trainable_variables)) #: Update policy network if strategy == "VanillaPG": with tf.GradientTape() as tape2: _, action_probs = self.policy(_states) action_probs += 1e-5 selected_action_logprobs = tf.reduce_sum( _selected_actions * tf.math.log(action_probs), axis=1, keepdims=True) objective = selected_action_logprobs * advantages dist = tfd.Independent( tfd.OneHotCategorical(probs=action_probs), reinterpreted_batch_ndims=0) ent = dist.entropy() policy_loss = objective + self.config.ent_scale * ent[..., None] policy_loss *= -1 discounted_policy_loss = tf.reduce_mean(policy_loss * _weights) elif strategy == "PPO": with tf.GradientTape() as tape2: _, action_probs = self.policy(_states) action_probs += 1e-5 new_logprobs = tf.math.log(action_probs) ratio = tf.reduce_sum(_selected_actions * tf.exp(new_logprobs - _old_logprobs), axis=1, keepdims=True) ratio_clipped = tf.clip_by_value(ratio, 0.9, 1.1) obj_unclipped = ratio * advantages obj_clipped = ratio_clipped * advantages objective = tf.minimum(obj_unclipped, obj_clipped) dist = tfd.Independent( tfd.OneHotCategorical(probs=action_probs), reinterpreted_batch_ndims=0) ent = dist.entropy() policy_loss = objective + self.config.ent_scale * ent[..., None] policy_loss *= -1 discounted_policy_loss = tf.reduce_mean(policy_loss * _weights) grads = tape2.gradient(discounted_policy_loss, self.policy.trainable_variables) self.policy_optimizer.apply_gradients( zip(grads, self.policy.trainable_variables)) info = { "policy_loss": tf.reduce_mean(policy_loss), "objective": tf.reduce_mean(objective), "actor_entropy": tf.reduce_mean(ent), "value_loss": tf.reduce_mean(value_loss), "target_0": tf.reduce_mean(_targets), } return info def compute_target(self, states, rewards, next_states, discounts, strategy="mixed-multistep"): T, B, F = states.shape v_next = self.target_value(next_states) _weights = tf.concat([tf.ones_like(discounts[:1]), discounts[:-1]], axis=0) weights = tf.math.cumprod(_weights, axis=0) if strategy == "gae": """ HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION https://arxiv.org/pdf/1506.02438.pdf """ raise NotImplementedError() #lambda_ = self.config.lambda_gae #deltas = rewards + discounts * v_next - v 
#_weights = tf.concat( # [tf.ones_like(discounts[:1]), discounts[:-1] * lambda_], # axis=0) #weights = tf.math.cumprod(_weights, axis=0) #advantage = tf.reduce_sum(weights * deltas, axis=0) #v_target = advantage + v[0] elif strategy == "mixed-multistep": targets = np.zeros_like(v_next) #: (H, L*B, 1) last_value = v_next[-1] for i in reversed(range(targets.shape[0])): last_value = rewards[i] + discounts[i] * last_value targets[i] = last_value else: raise NotImplementedError() return targets, weights def testplay(self, test_id, video_dir: Path = None, weights=None): if weights: self.set_weights(weights) images = [] env = gym.make(self.env_id) obs = self.preprocess_func(env.reset()) episode_steps, episode_rewards = 0, 0 r_pred_total = 0. prev_z, prev_h = self.world_model.get_initial_state(batch_size=1) prev_a = tf.convert_to_tensor([[0] * self.action_space], dtype=tf.float32) done = False while not done: (h, z_prior, z_prior_probs, z_post, z_post_probs, feat, img_out, r_pred, discount_logit) = self.world_model(obs, prev_z, prev_h, prev_a) action = self.policy.sample_action(feat, 0) action_onehot = tf.one_hot([action], self.action_space) next_frame, reward, done, info = env.step(action) next_obs = self.preprocess_func(next_frame) #img_out = tfd.Independent(tfd.Bernoulli(logits=img_out), 3).mean() disc = tfd.Bernoulli(logits=discount_logit).mean() r_pred_total += float(r_pred) img = util.vizualize_vae(obs[0, :, :, 0], img_out.numpy()[0, :, :, 0], float(r_pred), float(disc), r_pred_total) images.append(img) #: Update states obs = next_obs prev_z, prev_h, prev_a = z_post, h, action_onehot episode_steps += 1 episode_rewards += reward #: avoiding agent freeze if episode_steps > 300 and episode_rewards < 2: break elif episode_steps > 1000 and episode_rewards < 10: break elif episode_steps > 4000: break if video_dir is not None: images[0].save(f'{video_dir}/testplay_{test_id}.gif', save_all=True, append_images=images[1:], optimize=False, duration=120, loop=0) return episode_steps, episode_rewards def testplay_in_dream(self, test_id, outdir: Path, H, weights=None): if weights: self.set_weights(weights) img_outs = [] prev_z, prev_h = self.world_model.get_initial_state(batch_size=1) prev_a = tf.convert_to_tensor([[0] * self.action_space], dtype=tf.float32) actions, rewards, discounts = [], [], [] env = gym.make(self.env_id) obs = self.preprocess_func(env.reset()) N = random.randint(2, 10) for i in range(N + H + 1): if i < N: (h, z_prior, z_prior_probs, z_post, z_post_probs, feat, img_out, r_pred, disc_logit) = self.world_model(obs, prev_z, prev_h, prev_a) discount_pred = tfd.Bernoulli(logits=disc_logit).mean() img_out = obs[0, :, :, 0] action = 1 if i == 0 else self.policy.sample_action(feat, 0) next_frame, reward, done, info = env.step(action) obs = self.preprocess_func(next_frame) z = z_post else: h = self.world_model.step_h(prev_z, prev_h, prev_a) z, _ = self.world_model.rssm.sample_z_prior(h) z = tf.reshape(z, [1, -1]) feat = tf.concat([z, h], axis=-1) img_out = self.world_model.decoder(feat) #img_out = tfd.Independent(tfd.Bernoulli(logits=img_out), 3).mean() img_out = img_out.numpy()[0, :, :, 0] r_pred = self.world_model.reward_head(feat) disc_logit = self.world_model.discount_head(feat) discount_pred = tfd.Bernoulli(logits=disc_logit).mean() action = self.policy.sample_action(feat, 0) actions.append(int(action)) rewards.append(float(r_pred)) discounts.append(float(discount_pred)) img_outs.append(img_out) action_onehot = tf.one_hot([action], self.action_space) prev_z, prev_h, prev_a = z, h, 
action_onehot img_outs, actions, rewards, discounts = img_outs[: -1], actions[:-1], rewards[ 1:], discounts[1:] images = util.visualize_dream(img_outs, actions, rewards, discounts) images[0].save(f'{outdir}/test_in_dream_{test_id}.gif', save_all=True, append_images=images[1:], optimize=False, duration=1000, loop=0)
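# compute_target()'s "mixed-multistep" branch bootstraps from the target critic at the
# imagination horizon and folds the predicted rewards backwards through the predicted
# discounts. The recursion in isolation, assuming (H, N, 1)-shaped numpy arrays
# (function name is illustrative):
import numpy as np

def multistep_targets(rewards, discounts, v_next):
    targets = np.zeros_like(v_next)
    last_value = v_next[-1]                     # bootstrap from the target value network
    for t in reversed(range(targets.shape[0])):
        last_value = rewards[t] + discounts[t] * last_value
        targets[t] = last_value
    return targets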
class Agent: def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=5e-3, fc1_dim=256, fc2_dim=256, batch_size=256, reward_scale=2): self.gamma = gamma self.tau = tau self.memory = ReplayBuffer(max_size, input_dims, n_actions) self.batch_size = batch_size self.n_actions = n_actions self.actor = ActorNetwork(alpha, input_dims, n_actions, env.action_space.high) self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1') self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2') self.value = ValueNetwork(beta, input_dims, name='value') self.target_value = ValueNetwork(beta, input_dims, name='target_value') self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, obs): state = T.Tensor([obs]).to(self.actor.device) actions, _ = self.actor.sample_normal(state, reparameterize=False) return actions.cpu().detach().numpy()[0] def remember(self, state, n_state, action, reward, done): self.memory.store_transition(state, n_state, action, reward, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau trg_value_params = self.target_value.named_parameters() value_params = self.value.named_parameters() trg_value_state_dict = dict(trg_value_params) value_state_dict = dict(value_params) for name in value_state_dict: value_state_dict[name] = tau*value_state_dict[name].clone() + \ (1-tau)*trg_value_state_dict[name].clone() self.target_value.load_state_dict(value_state_dict) def save_models(self): print('... saving models ...') self.actor.save_ckpt() self.value.save_ckpt() self.target_value.save_ckpt() self.critic1.save_ckpt() self.critic2.save_ckpt() def load_models(self): print('... loading models ...') self.actor.load_ckpt() self.value.load_ckpt() self.target_value.load_ckpt() self.critic1.load_ckpt() self.critic2.load_ckpt() def learn(self): if self.memory.mem_ptr < self.batch_size: return s, ns, a, r, t = \ self.memory.sample_buffer(self.batch_size) s = T.Tensor(s).to(self.actor.device) ns = T.Tensor(ns).to(self.actor.device) a = T.Tensor(a).to(self.actor.device) r = T.Tensor(r).to(self.actor.device) t = T.tensor(t).to(self.actor.device) # update value net value = self.value(s).view(-1) value_ = self.target_value(ns).view(-1) value_[t] = 0.0 actions, logprobs = self.actor.sample_normal(s, reparameterize=False) logprobs = logprobs.view(-1) critic_value = T.min(self.critic1(s, actions), self.critic2(s, actions)) critic_value = critic_value.view(-1) value_target = critic_value - logprobs value_loss = 0.5 * F.mse_loss(value, value_target) self.value.optimizer.zero_grad() value_loss.backward(retain_graph=True) self.value.optimizer.step() # update actor net actions, logprobs = self.actor.sample_normal(s, reparameterize=True) logprobs = logprobs.view(-1) critic_value = T.min(self.critic1(s, actions), self.critic2(s, actions)) critic_value = critic_value.view(-1) actor_loss = T.mean(logprobs - critic_value) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() # update critic net q_hat = self.scale * r + self.gamma * value_ q1 = self.critic1(s, a).view(-1) q2 = self.critic2(s, a).view(-1) critic1_loss = 0.5 * F.mse_loss(q_hat, q1) critic2_loss = 0.5 * F.mse_loss(q_hat, q2) critic_loss = critic1_loss + critic2_loss self.critic1.optimizer.zero_grad() self.critic2.optimizer.zero_grad() critic_loss.backward() self.critic1.optimizer.step() self.critic2.optimizer.step() self.update_network_parameters()
class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  name='actor', max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        # sets the target-network parameters equal to the value network's at the beginning
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:  # different copies: exact (tau=1) vs. soft (default tau)
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # takes the batch
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        # transforms into tensors on the actor's device
        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0  # is this using 0 to mean True? @15, 17 -- done acts as a boolean mask zeroing terminal next-state values

        # takes the lower of the two critics' Q-values
        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        # why not simply take critic_value? -- the log-prob term adds the entropy bonus
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        # the reward scaling factor accounts for the entropy term and encourages exploration
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
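# For reference, the three quantities fitted by the learn() step above, in the code's notation:
#   value network:  V(s)     -> min(Q1(s, a~pi), Q2(s, a~pi)) - log pi(a|s)
#   actor:          minimize    log pi(a|s) - min(Q1(s, a~pi), Q2(s, a~pi))
#   critics:        Q(s, a)  -> q_hat = reward_scale * r + gamma * V_target(s')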
class Agent:
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_value.weights
        for i, weight in enumerate(self.value.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))

        self.target_value.set_weights(weights)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.critic_1.save_weights(self.critic_1.checkpoint_file)
        self.critic_2.save_weights(self.critic_2.checkpoint_file)
        self.value.save_weights(self.value.checkpoint_file)
        self.target_value.save_weights(self.target_value.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.critic_1.load_weights(self.critic_1.checkpoint_file)
        self.critic_2.load_weights(self.critic_2.checkpoint_file)
        self.value.load_weights(self.value.checkpoint_file)
        self.target_value.load_weights(self.target_value.checkpoint_file)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value(states), 1)
            value_ = tf.squeeze(self.target_value(states_), 1)

            current_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=False)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, current_policy_actions)
            q2_new_policy = self.critic_2(states, current_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_network_gradient = tape.gradient(
            value_loss, self.value.trainable_variables)
        self.value.optimizer.apply_gradients(
            zip(value_network_gradient, self.value.trainable_variables))

        with tf.GradientTape() as tape:
            # the original paper applies the reparameterization trick here;
            # this sample_normal accepts the flag but does not implement it,
            # so the actions are just the usual sampled actions
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, new_policy_actions)
            q2_new_policy = self.critic_2(states, new_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            actor_loss = log_probs - critic_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(
            actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        with tf.GradientTape(persistent=True) as tape:
            # value_ from the first tape is just an ordinary tensor, so it can
            # be reused here; persistent=True lets us take two gradients,
            # one per critic, from this single tape
            q_hat = self.scale * rewards + self.gamma * value_ * (1 - done)
            q1_old_policy = tf.squeeze(self.critic_1(states, actions), 1)
            q2_old_policy = tf.squeeze(self.critic_2(states, actions), 1)
            critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat)
            critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat)

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)

        self.critic_1.optimizer.apply_gradients(
            zip(critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(
            zip(critic_2_network_gradient, self.critic_2.trainable_variables))

        self.update_network_parameters()
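# --- Illustrative aside: update_network_parameters above (and its PyTorch
# counterparts below) implements a Polyak soft update,
#     theta_target <- tau * theta_online + (1 - tau) * theta_target.
# The standalone sketch below is NOT part of the original code; the array
# values and function name are illustrative only.
import numpy as np

def soft_update(online_weights, target_weights, tau):
    # new_target = tau * online + (1 - tau) * target, applied weight by weight
    return [tau * w + (1.0 - tau) * t
            for w, t in zip(online_weights, target_weights)]

online = [np.array([1.0, 2.0]), np.array([[0.5]])]
target = [np.array([0.0, 0.0]), np.array([[0.0]])]

# with tau=0.005 the target moves only 0.5% of the way toward the online net
print(soft_update(online, target, tau=0.005))  # [array([0.005, 0.01]), array([[0.0025]])]

# tau=1 copies the online weights outright, which is what the constructors do
# via update_network_parameters(tau=1) to initialise the target network
print(soft_update(online, target, tau=1.0))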
class Agent():
    def __init__(self, input_dims, env, n_actions):
        self.memory = ReplayBuffer(input_dims)
        self.n_actions = n_actions
        self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                     name=Constants.env_id + '_actor',
                                     max_action=env.action_space.n)
        self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                               name=Constants.env_id + '_critic_local_1')
        self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                               name=Constants.env_id + '_critic_local_2')
        self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                                name=Constants.env_id + '_critic_target_1')
        self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                                name=Constants.env_id + '_critic_target_2')
        self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
        self.target_value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_target_value')
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(Constants.device)
        _, max_probability_action = self.actor_nn.sample_action(state)
        return max_probability_action

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < Hyper.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer()
        reward = T.tensor(reward, dtype=T.float).to(Constants.device)
        done = T.tensor(done).to(Constants.device)
        next_state = T.tensor(next_state, dtype=T.float).to(Constants.device)
        state = T.tensor(state, dtype=T.float).to(Constants.device)
        action = T.tensor(action, dtype=T.float).to(Constants.device)

        value_from_nn = self.value_nn(state)
        new_value_from_nn = self.target_value_nn(next_state).view(-1)
        new_value_from_nn[done] = 0.0

        # CHANGE0003 Soft state-value target where actions are discrete:
        # V(s) = sum_a pi(a|s) * (min_i Q_i(s, a) - alpha * log pi(a|s)),
        # evaluated on the current state with the target critics
        (action_probabilities, log_action_probabilities), _ = \
            self.actor_nn.sample_action(state)
        with T.no_grad():
            q1_new_policy = self.critic_target_1_nn(state)
            q2_new_policy = self.critic_target_2_nn(state)
            critic_value = T.min(q1_new_policy, q2_new_policy)
        value_target = (action_probabilities *
                        (critic_value - Hyper.alpha * log_action_probabilities)).sum(dim=1)
        self.value_nn.optimizer.zero_grad()
        value_loss = 0.5 * F.mse_loss(value_from_nn.view(-1), value_target)
        value_loss.backward(retain_graph=True)
        self.value_nn.optimizer.step()

        # CHANGE0005 Objective for the policy (discrete actions):
        # J_pi = E_s [ sum_a pi(a|s) * (alpha * log pi(a|s) - min_i Q_i(s, a)) ]
        (action_probabilities, log_action_probabilities), _ = \
            self.actor_nn.sample_action(state)
        with T.no_grad():
            q1_new_policy = self.critic_local_1_nn(state)
            q2_new_policy = self.critic_local_2_nn(state)
            critic_value = T.min(q1_new_policy, q2_new_policy)
        actor_loss = (action_probabilities *
                      (Hyper.alpha * log_action_probabilities - critic_value)).sum(dim=1)
        actor_loss = T.mean(actor_loss)
        self.actor_nn.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_nn.optimizer.step()

        self.critic_local_1_nn.optimizer.zero_grad()
        self.critic_local_2_nn.optimizer.zero_grad()
        q_hat = Hyper.reward_scale * reward + Hyper.gamma * new_value_from_nn
        # the critics output one Q-value per discrete action; select the value
        # of the action actually taken (the buffer stores integer action
        # indices), rather than taking an argmax over the outputs
        q1_old_policy = self.critic_local_1_nn(state).gather(
            1, action.long().unsqueeze(-1)).view(-1)
        q2_old_policy = self.critic_local_2_nn(state).gather(
            1, action.long().unsqueeze(-1)).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_local_1_nn.optimizer.step()
        self.critic_local_2_nn.optimizer.step()
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = Hyper.tau

        target_value_params = self.target_value_nn.named_parameters()
        value_params = self.value_nn.named_parameters()
        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)
        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()
        self.target_value_nn.load_state_dict(value_state_dict)

        self.update_network_parameters_line(
            self.critic_target_1_nn.named_parameters(),
            self.critic_local_1_nn.named_parameters(), tau)
        self.update_network_parameters_line(
            self.critic_target_2_nn.named_parameters(),
            self.critic_local_2_nn.named_parameters(), tau)

    def update_network_parameters_line(self, target_params, local_params, tau):
        for target_param, local_param in zip(target_params, local_params):
            target_param[1].data.copy_(tau * local_param[1].data +
                                       (1.0 - tau) * target_param[1].data)

    def save_models(self):
        print('.... saving models ....')
        self.actor_nn.save_checkpoint()
        self.value_nn.save_checkpoint()
        self.target_value_nn.save_checkpoint()
        self.critic_local_1_nn.save_checkpoint()
        self.critic_local_2_nn.save_checkpoint()
        self.critic_target_1_nn.save_checkpoint()
        self.critic_target_2_nn.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor_nn.load_checkpoint()
        self.value_nn.load_checkpoint()
        self.target_value_nn.load_checkpoint()
        self.critic_local_1_nn.load_checkpoint()
        self.critic_local_2_nn.load_checkpoint()
        self.critic_target_1_nn.load_checkpoint()
        self.critic_target_2_nn.load_checkpoint()
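# --- Illustrative aside: in the discrete critic loss above, gather() picks
# out Q(s, a) for the action actually taken, whereas argmax() would return
# action indices, not Q-values. The sketch below is NOT part of the original
# code and uses made-up tensor values.
import torch as T

q_values = T.tensor([[1.0, 5.0, 2.0],     # Q(s0, .)
                     [0.3, 0.1, 4.0]])    # Q(s1, .)
actions = T.tensor([0, 2])                # actions taken in the batch

q_taken = q_values.gather(1, actions.long().unsqueeze(-1)).view(-1)
print(q_taken)                 # tensor([1., 4.]) -> usable as q_old_policy

print(q_values.argmax(dim=1))  # tensor([1, 2]) -> indices, not Q-values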