class MLPPolicy(torch.nn.Module):
    """Actor-critic MLP with separate value and policy towers.

    Both towers are two 64-unit tanh layers built from bias-free
    ``nn.Linear`` layers followed by ``AddBias`` modules. The policy tower
    ends in a mean layer plus a separate ``AddBias`` used for the log-std;
    the value tower ends in a single scalar output.

    NOTE(review): this file defines another class named ``MLPPolicy``
    further down; if both live at module level, the later definition
    replaces this one at import time.

    Args:
        num_inputs: Size of the flat observation vector.
        action_space: Object whose ``shape[0]`` gives the action dimension.
    """

    def __init__(self, num_inputs, action_space):
        super(MLPPolicy, self).__init__()

        self.obs_filter = ObsNorm((1, num_inputs), clip=5)
        self.action_space = action_space

        # Policy ("a_") tower.
        self.a_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.a_ab1 = AddBias(64)
        self.a_fc2 = nn.Linear(64, 64, bias=False)
        self.a_ab2 = AddBias(64)

        self.a_fc_mean = nn.Linear(64, action_space.shape[0], bias=False)
        self.a_ab_mean = AddBias(action_space.shape[0])
        self.a_ab_logstd = AddBias(action_space.shape[0])

        # Value ("v_") tower.
        self.v_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.v_ab1 = AddBias(64)
        self.v_fc2 = nn.Linear(64, 64, bias=False)
        self.v_ab2 = AddBias(64)
        self.v_fc3 = nn.Linear(64, 1, bias=False)
        self.v_ab3 = AddBias(1)

        self.apply(weights_init_mlp)

        # Shrink the mean layer so initial action means start close to zero.
        self.a_fc_mean.weight.data.mul_(0.01)

        self.train()

    def cuda(self, **args):
        """Move the module and the observation filter to the GPU.

        Returns:
            ``self``, matching ``nn.Module.cuda`` so that
            ``model = model.cuda()`` keeps working.
        """
        super(MLPPolicy, self).cuda(**args)
        # The filter is moved explicitly in addition to the module's own
        # parameters and buffers.
        self.obs_filter.cuda()
        return self

    def cpu(self, **args):
        """Move the module and the observation filter to the CPU.

        Returns:
            ``self``, matching ``nn.Module.cpu``.
        """
        super(MLPPolicy, self).cpu(**args)
        self.obs_filter.cpu()
        return self

    def forward(self, inputs):
        """Run both towers on a batch of observations.

        Note that ``inputs.data`` is rebound to the filtered observations,
        so the caller's variable holds the filtered values afterwards.

        Returns:
            Tuple ``(value, action_mean, action_logstd)``.
        """
        inputs.data = self.obs_filter(inputs.data)

        # Value tower.
        x = self.v_fc1(inputs)
        x = self.v_ab1(x)
        x = F.tanh(x)

        x = self.v_fc2(x)
        x = self.v_ab2(x)
        x = F.tanh(x)

        x = self.v_fc3(x)
        x = self.v_ab3(x)
        value = x

        # Policy tower.
        x = self.a_fc1(inputs)
        x = self.a_ab1(x)
        x = F.tanh(x)

        x = self.a_fc2(x)
        x = self.a_ab2(x)
        x = F.tanh(x)

        x = self.a_fc_mean(x)
        x = self.a_ab_mean(x)
        action_mean = x

        # An ugly hack for the KFAC implementation: the log-std does not
        # depend on the observation, so a zero tensor shaped like the mean
        # is passed through the AddBias module (presumably yielding the bias
        # expanded to that shape -- confirm against AddBias).
        zeros = Variable(torch.zeros(x.size()), volatile=x.volatile)
        if x.is_cuda:
            zeros = zeros.cuda()

        x = self.a_ab_logstd(zeros)
        action_logstd = x

        return value, action_mean, action_logstd

    def act(self, inputs, deterministic=False):
        """Choose an action for each observation.

        Args:
            inputs: Batch of observations passed to ``forward``.
            deterministic: When truthy, return the mean action; otherwise
                sample ``mean + std * noise`` with standard normal noise.

        Returns:
            Tuple ``(value, action)``.
        """
        value, action_mean, action_logstd = self(inputs)

        action_std = action_logstd.exp()

        # Noise is drawn unconditionally so the RNG stream is the same in
        # both modes.
        noise = Variable(torch.randn(action_std.size()))
        if action_std.is_cuda:
            noise = noise.cuda()

        # Truthiness rather than ``is False``: identity comparison treated
        # 0, None and numpy.False_ as "deterministic".
        if deterministic:
            action = action_mean
        else:
            action = action_mean + action_std * noise
        return value, action

    def evaluate_actions(self, inputs, actions):
        """Score given actions under the current policy.

        Args:
            inputs: 2-D batch of observations.
            actions: Actions to evaluate, broadcastable against the mean.

        Returns:
            Tuple ``(value, action_log_probs, dist_entropy)`` where the
            log-probabilities are summed over dimension 1 (kept as a size-1
            dimension) and the entropy is averaged over the batch.
        """
        assert inputs.dim(
        ) == 2, "Expect to have inputs in num_processes * num_steps x ... format"

        value, action_mean, action_logstd = self(inputs)

        action_std = action_logstd.exp()

        # Per-dimension diagonal Gaussian log-density.
        action_log_probs = -0.5 * (
            (actions - action_mean) / action_std).pow(2) - 0.5 * math.log(
                2 * math.pi) - action_logstd
        action_log_probs = action_log_probs.sum(1, keepdim=True)

        # Gaussian entropy per dimension is 0.5 + 0.5*log(2*pi) + log(std).
        # It depends only on the log-std, not on the evaluated actions.
        dist_entropy = 0.5 + 0.5 * math.log(2 * math.pi) + action_logstd
        dist_entropy = dist_entropy.sum(-1).mean()

        return value, action_log_probs, dist_entropy
class MLPPolicy(FFPolicy):
    """MLP feature extractor with separate value and policy towers.

    Both towers are two 64-unit tanh layers built from bias-free
    ``nn.Linear`` layers followed by ``AddBias`` modules. The value tower
    ends in a scalar output; the policy tower's 64 features are returned
    as-is, and the action distribution head is stored in ``self.dist``.

    Args:
        num_inputs: Size of the flat observation vector.
        action_space: Action space whose class name is ``"Discrete"``
            (uses ``action_space.n``) or ``"Box"`` (uses
            ``action_space.shape[0]``).

    Raises:
        NotImplementedError: For any other action space class.
    """

    def __init__(self, num_inputs, action_space):
        super(MLPPolicy, self).__init__()

        self.obs_filter = ObsNorm((1, num_inputs), clip=5)
        self.action_space = action_space

        # Policy ("a_") tower.
        self.a_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.a_ab1 = AddBias(64)
        self.a_fc2 = nn.Linear(64, 64, bias=False)
        self.a_ab2 = AddBias(64)

        # Value ("v_") tower.
        self.v_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.v_ab1 = AddBias(64)
        self.v_fc2 = nn.Linear(64, 64, bias=False)
        self.v_ab2 = AddBias(64)
        self.v_fc3 = nn.Linear(64, 1, bias=False)
        self.v_ab3 = AddBias(1)

        # Dispatch on the class name rather than the type itself, so this
        # block does not need to reference the action-space classes.
        space_kind = action_space.__class__.__name__
        if space_kind == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(64, num_outputs)
        elif space_kind == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(64, num_outputs)
        else:
            raise NotImplementedError

        self.apply(weights_init_mlp)

        # For continuous actions, shrink the mean layer so initial action
        # means start close to zero.
        if space_kind == "Box":
            self.dist.fc_mean.weight.data.mul_(0.01)

        self.train()

    def cuda(self, **args):
        """Move the module and the observation filter to the GPU.

        Returns:
            ``self``, matching ``nn.Module.cuda`` so that
            ``model = model.cuda()`` keeps working.
        """
        super(MLPPolicy, self).cuda(**args)
        # The filter is moved explicitly in addition to the module's own
        # parameters and buffers.
        self.obs_filter.cuda()
        return self

    def cpu(self, **args):
        """Move the module and the observation filter to the CPU.

        Returns:
            ``self``, matching ``nn.Module.cpu``.
        """
        super(MLPPolicy, self).cpu(**args)
        self.obs_filter.cpu()
        return self

    def forward(self, inputs):
        """Run both towers on a batch of observations.

        Note that ``inputs.data`` is rebound to the filtered observations,
        so the caller's variable holds the filtered values afterwards.

        Returns:
            Tuple ``(value, x)`` where ``x`` is the 64-unit output of the
            policy tower.
        """
        inputs.data = self.obs_filter(inputs.data)

        # Value tower.
        x = self.v_fc1(inputs)
        x = self.v_ab1(x)
        x = F.tanh(x)

        x = self.v_fc2(x)
        x = self.v_ab2(x)
        x = F.tanh(x)

        x = self.v_fc3(x)
        x = self.v_ab3(x)
        value = x

        # Policy tower.
        x = self.a_fc1(inputs)
        x = self.a_ab1(x)
        x = F.tanh(x)

        x = self.a_fc2(x)
        x = self.a_ab2(x)
        x = F.tanh(x)

        return value, x