def kldivergence(self, datas1, datas2):
    mean1, std1 = datas1
    mean2, std2 = datas2

    distribution1 = Normal(mean1, std1)
    distribution2 = Normal(mean2, std2)
    return kl_divergence(distribution1, distribution2).float().to(set_device(self.use_gpu))
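# Sanity check (standalone illustration, not part of the agent): for diagonal Gaussians the
# element-wise KL above has the closed form
#     KL(N(m1, s1) || N(m2, s2)) = log(s2 / s1) + (s1^2 + (m1 - m2)^2) / (2 * s2^2) - 1/2
# which matches torch.distributions.kl.kl_divergence. The example values are arbitrary.
mean1, std1 = torch.tensor([0.0, 0.5]), torch.tensor([1.0, 0.8])
mean2, std2 = torch.tensor([0.1, -0.2]), torch.tensor([0.9, 1.2])

kl_torch = kl_divergence(Normal(mean1, std1), Normal(mean2, std2))
kl_manual = torch.log(std2 / std1) + (std1.pow(2) + (mean1 - mean2).pow(2)) / (2 * std2.pow(2)) - 0.5

assert torch.allclose(kl_torch, kl_manual)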
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(PolicyModel, self).__init__()

    # fixed (non-learned) per-dimension standard deviation of the Gaussian policy
    self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))

    self.state_extractor = nn.Sequential(nn.Linear(1, 32), nn.ReLU())
    self.nn_layer = nn.Sequential(nn.Linear(160, 320), nn.ReLU(), nn.Linear(320, 128), nn.ReLU())

    # value head plus two bounded actor heads: one tanh output in [-1, 1] and two sigmoid outputs in [0, 1]
    self.critic_layer = nn.Sequential(nn.Linear(128, 1))
    self.actor_tanh_layer = nn.Sequential(nn.Linear(128, 1), nn.Tanh())
    self.actor_sigmoid_layer = nn.Sequential(nn.Linear(128, 2), nn.Sigmoid())
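# Hypothetical forward() sketch for the heads above (an assumption for illustration; the
# repository's actual forward pass is not shown here). It presumes 128-dim image features so
# that 128 + 32 state features match nn_layer's 160 inputs, and concatenates the 1-unit tanh
# head with the 2-unit sigmoid head into a 3-dim action mean paired with self.std.
def forward_sketch(self, image_features, state):
    state_features = self.state_extractor(state)                        # (batch, 32)
    x = self.nn_layer(torch.cat([image_features, state_features], -1))  # expects 160 inputs

    action_tanh = self.actor_tanh_layer(x)                              # bounded in [-1, 1]
    action_sigmoid = self.actor_sigmoid_layer(x)                        # bounded in [0, 1]
    action_mean = torch.cat([action_tanh, action_sigmoid], -1)          # (batch, 3)

    return (action_mean, self.std), self.critic_layer(x)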
def __init__(self, Policy_Model, Value_Model, CnnModel, ProjectionModel, state_dim, action_dim, policy_dist,
             policy_loss, auxppg_loss, auxclr_loss, policy_memory, auxppg_memory, auxclr_memory,
             ppo_epochs=10, auxppg_epochs=10, auxclr_epochs=10, n_aux_update=2, is_training_mode=True,
             policy_kl_range=0.03, policy_params=5, value_clip=1.0, entropy_coef=0.0, vf_loss_coef=1.0,
             batch_size=32, learning_rate=3e-4, folder='model', use_gpu=True):
    # clipping / regularization hyperparameters for the policy and value updates
    self.policy_kl_range = policy_kl_range
    self.policy_params = policy_params
    self.value_clip = value_clip
    self.entropy_coef = entropy_coef
    self.vf_loss_coef = vf_loss_coef
    self.batch_size = batch_size
    self.ppo_epochs = ppo_epochs
    self.auxppg_epochs = auxppg_epochs
    self.auxclr_epochs = auxclr_epochs
    self.is_training_mode = is_training_mode
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.learning_rate = learning_rate
    self.folder = folder
    self.use_gpu = use_gpu
    self.n_aux_update = n_aux_update

    self.device = set_device(self.use_gpu)

    # online and "old" copies of the policy and value networks, plus the shared CNN encoder
    # and the projection head used by the AuxCLR objective
    self.policy = Policy_Model(state_dim, action_dim, self.use_gpu).float().to(self.device)
    self.policy_old = Policy_Model(state_dim, action_dim, self.use_gpu).float().to(self.device)
    self.value = Value_Model(state_dim).float().to(self.device)
    self.value_old = Value_Model(state_dim).float().to(self.device)
    self.cnn = CnnModel().float().to(self.device)
    self.auxclr_projection = ProjectionModel().float().to(self.device)

    self.policy_dist = policy_dist

    self.policy_memory = policy_memory
    self.auxppg_memory = auxppg_memory
    self.auxclr_memory = auxclr_memory

    self.policyLoss = policy_loss
    self.auxppgLoss = auxppg_loss
    self.auxclrLoss = auxclr_loss

    self.i_auxppg_update = 0
    self.i_ppo_update = 0

    # one optimizer and one AMP grad scaler per training phase (PPO, auxiliary PPG, AuxCLR)
    self.ppo_optimizer = Adam(list(self.policy.parameters()) + list(self.value.parameters()) + list(self.cnn.parameters()), lr=learning_rate)
    self.auxppg_optimizer = Adam(list(self.policy.parameters()), lr=learning_rate)
    self.auxclr_optimizer = Adam(list(self.cnn.parameters()) + list(self.auxclr_projection.parameters()), lr=learning_rate)

    self.ppo_scaler = torch.cuda.amp.GradScaler()
    self.auxppg_scaler = torch.cuda.amp.GradScaler()
    self.auxclr_scaler = torch.cuda.amp.GradScaler()

    self.policy_old.load_state_dict(self.policy.state_dict())
    self.value_old.load_state_dict(self.value.state_dict())

    # image preprocessing: HxWx3 uint8 frame -> CHW float tensor in [-1, 1]
    self.trans = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    if is_training_mode:
        self.policy.train()
        self.value.train()
    else:
        self.policy.eval()
        self.value.eval()
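# Minimal mixed-precision update sketch (an assumption for illustration, not the agent's
# actual update code): this is the typical way each optimizer / GradScaler pair created
# above is driven, with the loss computation left as a placeholder callable.
def amp_update_sketch(loss_fn, optimizer, scaler, *batch):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = loss_fn(*batch)        # e.g. the PPO loss for the ppo_optimizer / ppo_scaler pair
    scaler.scale(loss).backward()     # scale the loss and backpropagate in mixed precision
    scaler.step(optimizer)            # unscale the gradients, then apply the optimizer step
    scaler.update()                   # adjust the loss scale for the next iteration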
def compute_loss(self, first_encoded, second_encoded):
    # row i of the two batches is a positive pair; all other rows act as negatives
    indexes = torch.arange(first_encoded.shape[0]).long().to(set_device(self.use_gpu))

    similarity = torch.nn.functional.cosine_similarity(first_encoded.unsqueeze(1), second_encoded.unsqueeze(0), dim=2)
    return torch.nn.functional.cross_entropy(similarity, indexes)
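# Standalone usage sketch of compute_loss's structure: cross-entropy over the cosine-similarity
# matrix pushes the diagonal (positive-pair) entries up relative to the off-diagonal negatives.
# The batch size of 32 and the 128-dim projection size are assumptions for illustration.
first_encoded = torch.randn(32, 128)     # projections of the first augmented views
second_encoded = torch.randn(32, 128)    # projections of the second augmented views

indexes = torch.arange(32).long()
similarity = torch.nn.functional.cosine_similarity(
    first_encoded.unsqueeze(1), second_encoded.unsqueeze(0), dim=2)   # (32, 32) similarity matrix
loss = torch.nn.functional.cross_entropy(similarity, indexes)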
def __init__(self, Policy_Model, Value_Model, Q_Model, CnnModel, ProjectionModel, state_dim, action_dim,
             policy_dist, q_loss, v_loss, policy_loss, auxclr_loss, policy_memory, auxclr_memory,
             is_training_mode=True, batch_size=32, cql_epochs=4, auxclr_epochs=4, soft_tau=0.95,
             learning_rate=3e-4, folder='model', use_gpu=True):
    self.batch_size = batch_size
    self.is_training_mode = is_training_mode
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.learning_rate = learning_rate
    self.folder = folder
    self.use_gpu = use_gpu
    self.cql_epochs = cql_epochs
    self.auxclr_epochs = auxclr_epochs
    self.soft_tau = soft_tau

    self.device = set_device(self.use_gpu)

    # twin soft Q-networks, a state-value network, and the policy
    self.soft_q1 = Q_Model(state_dim, action_dim, self.use_gpu).float().to(self.device)
    self.soft_q2 = Q_Model(state_dim, action_dim, self.use_gpu).float().to(self.device)
    self.value = Value_Model(state_dim, self.use_gpu).float().to(self.device)
    self.policy = Policy_Model(state_dim, action_dim, self.use_gpu).float().to(self.device)

    # shared CNN encoder and the projection head used by the AuxCLR objective
    self.cnn = CnnModel().float().to(self.device)
    self.auxclr_projection = ProjectionModel().float().to(self.device)

    self.policy_dist = policy_dist

    self.policy_memory = policy_memory
    self.auxclr_memory = auxclr_memory

    self.qLoss = q_loss
    self.vLoss = v_loss
    self.policyLoss = policy_loss
    self.auxclrLoss = auxclr_loss

    # one optimizer and one AMP grad scaler per objective
    self.soft_q_optimizer = Adam(list(self.soft_q1.parameters()) + list(self.soft_q2.parameters()), lr=learning_rate)
    self.auxclr_optimizer = Adam(list(self.cnn.parameters()) + list(self.auxclr_projection.parameters()), lr=learning_rate)
    self.value_optimizer = Adam(self.value.parameters(), lr=learning_rate)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=learning_rate)

    self.soft_q_scaler = torch.cuda.amp.GradScaler()
    self.value_scaler = torch.cuda.amp.GradScaler()
    self.policy_scaler = torch.cuda.amp.GradScaler()
    self.auxclr_scaler = torch.cuda.amp.GradScaler()

    # image preprocessing: HxWx3 uint8 frame -> CHW float tensor in [-1, 1]
    self.trans = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
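# Standalone example of the self.trans preprocessing used by both agents: ToTensor turns an
# HxWx3 uint8 frame into a CHW float tensor in [0, 1], and Normalize shifts it to [-1, 1].
# The 96x96 frame size is an assumption for illustration.
import numpy as np

trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

frame = np.random.randint(0, 256, (96, 96, 3), dtype=np.uint8)
obs = trans(frame)               # shape (3, 96, 96), values in [-1, 1]
batch = obs.unsqueeze(0)         # add a batch dimension before feeding the CNN encoder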
def sample(self, datas):
    mean, std = datas

    # draw standard-normal noise with the same shape as the mean so each action dimension
    # gets its own noise, then shift and scale it by the policy's mean and std
    distribution = Normal(0, 1)
    rand = distribution.sample(mean.shape).float().to(set_device(self.use_gpu))
    return (mean + std * rand).squeeze(0)
def logprob(self, datas, value_data):
    mean, std = datas

    distribution = Normal(mean, std)
    return distribution.log_prob(value_data).float().to(set_device(self.use_gpu))
def entropy(self, datas):
    mean, std = datas

    distribution = Normal(mean, std)
    return distribution.entropy().float().to(set_device(self.use_gpu))
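# Standalone sketch tying the continuous-distribution helpers together; `dist` is assumed to
# be an instance of the class that owns sample/logprob/entropy/kldivergence, and the (1, 3)
# shapes are assumptions for illustration.
def distribution_usage_sketch(dist):
    mean = torch.zeros(1, 3)
    std = torch.ones(1, 3) * 0.5

    action = dist.sample((mean, std))                  # noisy draw around the mean
    logprob = dist.logprob((mean, std), action)        # per-dimension log-likelihood of the draw
    entropy = dist.entropy((mean, std))                # per-dimension entropy of the policy

    shifted = (mean + 0.1, std)
    kl = dist.kldivergence((mean, std), shifted)       # per-dimension KL(old || new)
    return action, logprob, entropy, kl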