Example #1
    def kldivergence(self, datas1, datas2):
        mean1, std1 = datas1
        mean2, std2 = datas2

        distribution1 = Normal(mean1, std1)
        distribution2 = Normal(mean2, std2)
        return kl_divergence(distribution1, distribution2).float().to(set_device(self.use_gpu))
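Example #1 above wraps PyTorch's analytic KL divergence between two diagonal Gaussians. A minimal self-contained sketch of the same call, with hypothetical tensors in place of the snippet's inputs:

import torch
from torch.distributions import Normal
from torch.distributions.kl import kl_divergence

# Hypothetical Gaussian parameters: batch of 4 samples, 3 action dims
mean1, std1 = torch.zeros(4, 3), torch.ones(4, 3)
mean2, std2 = torch.full((4, 3), 0.1), torch.full((4, 3), 0.5)

kl = kl_divergence(Normal(mean1, std1), Normal(mean2, std2))  # element-wise KL, shape (4, 3)
per_sample_kl = kl.sum(-1)                                    # sum over action dimensions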
Example #2
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(PolicyModel, self).__init__()

        self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))

        self.state_extractor = nn.Sequential(nn.Linear(1, 32), nn.ReLU())
        self.nn_layer = nn.Sequential(nn.Linear(160, 320), nn.ReLU(),
                                      nn.Linear(320, 128), nn.ReLU())

        self.critic_layer = nn.Sequential(nn.Linear(128, 1))
        self.actor_tanh_layer = nn.Sequential(nn.Linear(128, 1), nn.Tanh())
        self.actor_sigmoid_layer = nn.Sequential(nn.Linear(128, 2),
                                                 nn.Sigmoid())
Example #3
    def __init__(self,
                 Policy_Model,
                 Value_Model,
                 CnnModel,
                 ProjectionModel,
                 state_dim,
                 action_dim,
                 policy_dist,
                 policy_loss,
                 auxppg_loss,
                 auxclr_loss,
                 policy_memory,
                 auxppg_memory,
                 auxclr_memory,
                 ppo_epochs=10,
                 auxppg_epochs=10,
                 auxclr_epochs=10,
                 n_aux_update=2,
                 is_training_mode=True,
                 policy_kl_range=0.03,
                 policy_params=5,
                 value_clip=1.0,
                 entropy_coef=0.0,
                 vf_loss_coef=1.0,
                 batch_size=32,
                 learning_rate=3e-4,
                 folder='model',
                 use_gpu=True):

        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.entropy_coef = entropy_coef
        self.vf_loss_coef = vf_loss_coef
        self.batch_size = batch_size
        self.ppo_epochs = ppo_epochs
        self.auxppg_epochs = auxppg_epochs
        self.auxclr_epochs = auxclr_epochs
        self.is_training_mode = is_training_mode
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.learning_rate = learning_rate
        self.folder = folder
        self.use_gpu = use_gpu
        self.n_aux_update = n_aux_update

        self.device = set_device(self.use_gpu)

        self.policy = Policy_Model(state_dim, action_dim,
                                   self.use_gpu).float().to(self.device)
        self.policy_old = Policy_Model(state_dim, action_dim,
                                       self.use_gpu).float().to(self.device)

        self.value = Value_Model(state_dim).float().to(self.device)
        self.value_old = Value_Model(state_dim).float().to(self.device)

        self.cnn = CnnModel().float().to(self.device)
        self.auxclr_projection = ProjectionModel().float().to(self.device)

        self.policy_dist = policy_dist

        self.policy_memory = policy_memory
        self.auxppg_memory = auxppg_memory
        self.auxclr_memory = auxclr_memory

        self.policyLoss = policy_loss
        self.auxppgLoss = auxppg_loss
        self.auxclrLoss = auxclr_loss

        self.i_auxppg_update = 0
        self.i_ppo_update = 0

        self.ppo_optimizer = Adam(list(self.policy.parameters()) +
                                  list(self.value.parameters()) +
                                  list(self.cnn.parameters()),
                                  lr=learning_rate)
        self.auxppg_optimizer = Adam(list(self.policy.parameters()),
                                     lr=learning_rate)
        self.auxclr_optimizer = Adam(list(self.cnn.parameters()) +
                                     list(self.auxclr_projection.parameters()),
                                     lr=learning_rate)

        self.ppo_scaler = torch.cuda.amp.GradScaler()
        self.auxppg_scaler = torch.cuda.amp.GradScaler()
        self.auxclr_scaler = torch.cuda.amp.GradScaler()

        self.policy_old.load_state_dict(self.policy.state_dict())
        self.value_old.load_state_dict(self.value.state_dict())

        self.trans = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        if is_training_mode:
            self.policy.train()
            self.value.train()
        else:
            self.policy.eval()
            self.value.eval()
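The GradScaler instances created above follow PyTorch's standard mixed-precision training pattern; the update methods themselves are not part of this excerpt. A minimal sketch of that pattern, assuming a CUDA device and a stand-in model and loss:

import torch
import torch.nn as nn

model = nn.Linear(8, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
scaler = torch.cuda.amp.GradScaler()

x, y = torch.randn(32, 8).cuda(), torch.randn(32, 1).cuda()

optimizer.zero_grad()
with torch.cuda.amp.autocast():
    loss = nn.functional.mse_loss(model(x), y)   # forward pass in mixed precision
scaler.scale(loss).backward()                    # scale the loss to avoid fp16 underflow
scaler.step(optimizer)                           # unscale gradients, then step the optimizer
scaler.update()                                  # adjust the scale factor for the next step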
Example #4
    def compute_loss(self, first_encoded, second_encoded):
        # Pairwise cosine similarity between the two encoded views, trained with
        # cross-entropy so that each sample is matched to its own positive pair.
        indexes = torch.arange(first_encoded.shape[0]).long().to(set_device(self.use_gpu))

        similarity = torch.nn.functional.cosine_similarity(first_encoded.unsqueeze(1), second_encoded.unsqueeze(0), dim=2)
        return torch.nn.functional.cross_entropy(similarity, indexes)
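compute_loss above is an InfoNCE-style contrastive objective: cosine similarities between two encoded views form a logit matrix, and cross-entropy pulls each sample toward its own positive on the diagonal. A self-contained sketch with hypothetical encodings:

import torch
import torch.nn.functional as F

first = torch.randn(8, 16)    # hypothetical encodings of the first augmented view
second = torch.randn(8, 16)   # hypothetical encodings of the second augmented view

similarity = F.cosine_similarity(first.unsqueeze(1), second.unsqueeze(0), dim=2)  # (8, 8) logit matrix
targets = torch.arange(8)     # matching pairs lie on the diagonal
loss = F.cross_entropy(similarity, targets)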
Example #5
    def __init__(self,
                 Policy_Model,
                 Value_Model,
                 Q_Model,
                 CnnModel,
                 ProjectionModel,
                 state_dim,
                 action_dim,
                 policy_dist,
                 q_loss,
                 v_loss,
                 policy_loss,
                 auxclr_loss,
                 policy_memory,
                 auxclr_memory,
                 is_training_mode=True,
                 batch_size=32,
                 cql_epochs=4,
                 auxclr_epochs=4,
                 soft_tau=0.95,
                 learning_rate=3e-4,
                 folder='model',
                 use_gpu=True):

        self.batch_size = batch_size
        self.is_training_mode = is_training_mode
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.learning_rate = learning_rate
        self.folder = folder
        self.use_gpu = use_gpu
        self.cql_epochs = cql_epochs
        self.auxclr_epochs = auxclr_epochs
        self.soft_tau = soft_tau

        self.device = set_device(self.use_gpu)

        self.soft_q1 = Q_Model(state_dim, action_dim,
                               self.use_gpu).float().to(self.device)
        self.soft_q2 = Q_Model(state_dim, action_dim,
                               self.use_gpu).float().to(self.device)
        self.value = Value_Model(state_dim,
                                 self.use_gpu).float().to(self.device)
        self.policy = Policy_Model(state_dim, action_dim,
                                   self.use_gpu).float().to(self.device)

        self.cnn = CnnModel().float().to(self.device)
        self.auxclr_projection = ProjectionModel().float().to(self.device)

        self.policy_dist = policy_dist

        self.policy_memory = policy_memory
        self.auxclr_memory = auxclr_memory

        self.qLoss = q_loss
        self.vLoss = v_loss
        self.policyLoss = policy_loss
        self.auxclrLoss = auxclr_loss

        self.soft_q_optimizer = Adam(list(self.soft_q1.parameters()) +
                                     list(self.soft_q2.parameters()),
                                     lr=learning_rate)
        self.auxclr_optimizer = Adam(list(self.cnn.parameters()) +
                                     list(self.auxclr_projection.parameters()),
                                     lr=learning_rate)
        self.value_optimizer = Adam(self.value.parameters(), lr=learning_rate)
        self.policy_optimizer = Adam(self.policy.parameters(),
                                     lr=learning_rate)

        self.soft_q_scaler = torch.cuda.amp.GradScaler()
        self.value_scaler = torch.cuda.amp.GradScaler()
        self.policy_scaler = torch.cuda.amp.GradScaler()
        self.auxclr_scaler = torch.cuda.amp.GradScaler()

        self.trans = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
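The trans pipeline above is the usual torchvision preprocessing for image observations. A small sketch of what it does to a hypothetical uint8 RGB frame:

import numpy as np
from torchvision import transforms

trans = transforms.Compose([
    transforms.ToTensor(),                                    # HWC uint8 -> CHW float32 in [0, 1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),   # rescale each channel to roughly [-1, 1]
])

frame = np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8)  # hypothetical RGB observation
tensor = trans(frame)                                           # shape (3, 84, 84)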
Example #6
    def sample(self, datas):
        mean, std = datas

        # single standard-normal draw, broadcast across every element of mean/std
        distribution = Normal(0, 1)
        rand = distribution.sample().float().to(set_device(self.use_gpu))
        return (mean + std * rand).squeeze(0)
Example #7
    def logprob(self, datas, value_data):
        mean, std = datas

        distribution = Normal(mean, std)
        return distribution.log_prob(value_data).float().to(set_device(self.use_gpu))
Example #8
    def entropy(self, datas):
        mean, std = datas

        distribution = Normal(mean, std)
        return distribution.entropy().float().to(set_device(self.use_gpu))