class RainbowDQN(nn.Module):
    """Rainbow agent head for low-dimensional (vector) observations.

    Combines three Rainbow ingredients visible in this class:
    - Dueling networks: separate value and advantage streams, recombined as
      Q = V + A - mean(A).
    - Distributional RL (C51): each action's value is a categorical
      distribution over `num_atoms` support points in [Vmin, Vmax].
    - NoisyNets: the head layers are NoisyLinear, so exploration comes from
      parameter noise (plus an optional epsilon-greedy fallback in `act`).
    """

    def __init__(self, num_inputs, num_actions, num_atoms, Vmin, Vmax):
        """
        Args:
            num_inputs: size of the flat observation vector.
            num_actions: number of discrete actions.
            num_atoms: number of support atoms of the value distribution.
            Vmin, Vmax: bounds of the distribution support.
        """
        super(RainbowDQN, self).__init__()

        self.num_inputs = num_inputs
        self.num_actions = num_actions
        self.num_atoms = num_atoms
        self.Vmin = Vmin
        self.Vmax = Vmax

        # Shared trunk.
        self.linear1 = nn.Linear(num_inputs, 32)
        self.linear2 = nn.Linear(32, 64)

        # Value stream: outputs one distribution (num_atoms logits).
        self.noisy_value1 = NoisyLinear(64, 64, use_cuda=USE_CUDA)
        self.noisy_value2 = NoisyLinear(64, self.num_atoms, use_cuda=USE_CUDA)

        # Advantage stream: outputs one distribution per action.
        self.noisy_advantage1 = NoisyLinear(64, 64, use_cuda=USE_CUDA)
        self.noisy_advantage2 = NoisyLinear(64, self.num_atoms * self.num_actions, use_cuda=USE_CUDA)

    def forward(self, x):
        """Return per-action probability distributions.

        Args:
            x: observation batch, shape (batch, num_inputs).
        Returns:
            Tensor of shape (batch, num_actions, num_atoms); each
            (num_atoms,) slice is a softmax-normalized distribution.
        """
        batch_size = x.size(0)

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))

        value = F.relu(self.noisy_value1(x))
        value = self.noisy_value2(value)

        advantage = F.relu(self.noisy_advantage1(x))
        advantage = self.noisy_advantage2(advantage)

        value = value.view(batch_size, 1, self.num_atoms)
        advantage = advantage.view(batch_size, self.num_actions, self.num_atoms)

        # Dueling recombination; value broadcasts over the action dimension.
        x = value + advantage - advantage.mean(1, keepdim=True)
        # Normalize each action's atom logits into a probability distribution.
        x = F.softmax(x.view(-1, self.num_atoms), dim=1).view(-1, self.num_actions, self.num_atoms)

        return x

    def reset_noise(self):
        """Resample the factorized noise of every NoisyLinear layer."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def act(self, state, epsilon=0.0):
        """Choose an action: greedy w.r.t. expected value, else random.

        Args:
            state: a single observation (no batch dimension).
            epsilon: probability of taking a uniformly random action.
        Returns:
            int index in [0, num_actions).
        """
        if random.random() > epsilon:
            state = Variable(torch.FloatTensor(state).unsqueeze(0))
            dist = self.forward(state).data.cpu()
            # Expected value per action: sum_j p_ij * z_j over the support.
            dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
            action = dist.sum(2).max(1)[1].numpy()[0]
        else:
            # BUG FIX: the original returned -random.randrange(...), i.e. a
            # NEGATIVE action index; exploration must stay in [0, num_actions).
            action = random.randrange(self.num_actions)
        return action
class RainbowCnnDQN(nn.Module):
    """Rainbow agent head for image observations (Atari-style).

    A 3-layer convolutional trunk feeds dueling NoisyLinear value and
    advantage streams; the output is a categorical distribution (C51) over
    `num_atoms` support points in [Vmin, Vmax] for each action.
    """

    def __init__(self, input_shape, num_actions, num_atoms, Vmin, Vmax):
        """
        Args:
            input_shape: (channels, height, width) of one observation.
            num_actions: number of discrete actions.
            num_atoms: number of support atoms of the value distribution.
            Vmin, Vmax: bounds of the distribution support.
        """
        super(RainbowCnnDQN, self).__init__()

        self.input_shape = input_shape
        self.num_actions = num_actions
        self.num_atoms = num_atoms
        self.Vmin = Vmin
        self.Vmax = Vmax

        # Convolutional feature extractor.
        # Conv2d(in_channels, out_channels, kernel_size, stride); ReLU = max(0, x).
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU())

        # Compute the flattened conv output size ONCE (the original ran a
        # dummy forward pass through the conv stack twice).
        feat_size = self.feature_size()

        # NoisyLinear(in_features, out_features, use_cuda): head layers.
        self.noisy_value1 = NoisyLinear(feat_size, 512, use_cuda=USE_CUDA)
        self.noisy_value2 = NoisyLinear(512, self.num_atoms, use_cuda=USE_CUDA)
        self.noisy_advantage1 = NoisyLinear(feat_size, 512, use_cuda=USE_CUDA)
        self.noisy_advantage2 = NoisyLinear(512, self.num_atoms * self.num_actions, use_cuda=USE_CUDA)

    def forward(self, x):
        """Return per-action probability distributions.

        Args:
            x: uint8-range image batch, shape (batch, *input_shape);
               pixel values in [0, 255] are scaled to [0, 1] here.
        Returns:
            Tensor of shape (batch, num_actions, num_atoms); each
            (num_atoms,) slice is a softmax-normalized distribution.
        """
        batch_size = x.size(0)

        # Scale raw pixel intensities into [0, 1].
        x = x / 255.
        x = self.features(x)
        # Flatten conv maps: one row per batch element.
        x = x.view(batch_size, -1)

        # Value stream -> num_atoms logits per state.
        value = F.relu(self.noisy_value1(x))
        value = self.noisy_value2(value)

        # Advantage stream -> num_atoms * num_actions logits per state.
        advantage = F.relu(self.noisy_advantage1(x))
        advantage = self.noisy_advantage2(advantage)

        value = value.view(batch_size, 1, self.num_atoms)
        advantage = advantage.view(batch_size, self.num_actions, self.num_atoms)

        # DUELING NETWORKS: Q = V + A - mean_a(A); the mean is taken over the
        # action dimension (dim 1) so the advantages are zero-centered.
        x = value + advantage - advantage.mean(1, keepdim=True)

        # DISTRIBUTIONAL RL: softmax over the atom dimension gives, for each
        # action, num_atoms probabilities that sum to 1.
        x = F.softmax(x.view(-1, self.num_atoms), dim=1).view(-1, self.num_actions, self.num_atoms)

        return x

    def reset_noise(self):
        """Resample the factorized noise of every NoisyLinear layer."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def feature_size(self):
        """Flattened size of the conv output, found via a dummy forward pass."""
        return self.features(
            autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)

    def act(self, state):
        """Return the index of the action with the highest expected value.

        Args:
            state: a single observation (no batch dimension).
        Returns:
            int index in [0, num_actions).
        """
        # unsqueeze(0) adds a batch dimension; volatile=True marks the graph
        # as inference-only (no backprop bookkeeping, old PyTorch API).
        state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
        # .data strips the Variable wrapper; .cpu() moves the tensor to host.
        dist = self.forward(state).data.cpu()
        # torch.linspace: num_atoms equally spaced support points in [Vmin, Vmax].
        # Broadcasting multiplies each atom probability by its support value.
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
        # Sum over atoms (dim 2) -> expected value per action; argmax over
        # actions (dim 1); [1] picks indices, [0] unwraps the single batch row.
        action = dist.sum(2).max(1)[1].numpy()[0]
        return action
class RainbowDQN(nn.Module):
    """Rainbow agent head (CNN variant) for 84x84 image observations.

    Dueling NoisyLinear streams on top of a 3-layer conv trunk, producing a
    C51 categorical distribution over `num_atoms` support points in
    [Vmin, Vmax] for each action.
    """

    def __init__(self, input_shape, num_actions, num_atoms, Vmin, Vmax):
        """
        Args:
            input_shape: (channels, height, width) of one observation.
            num_actions: number of discrete actions.
            num_atoms: number of support atoms of the value distribution.
            Vmin, Vmax: bounds of the distribution support.
        """
        super(RainbowDQN, self).__init__()

        self.input_shape = input_shape
        self.num_actions = num_actions
        self.num_atoms = num_atoms
        self.Vmin = Vmin
        self.Vmax = Vmax

        # Output sizes below assume 84x84 input: ((W - K) / S) + 1.
        self.features = nn.Sequential(
            # ((84 - 8) / 4) + 1 = 20
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),  # B x 32 x 20 x 20
            nn.ReLU(),
            # ((20 - 4) / 2) + 1 = 9
            nn.Conv2d(32, 64, kernel_size=4, stride=2),  # B x 64 x 9 x 9
            nn.ReLU(),
            # ((9 - 3) / 1) + 1 = 7
            nn.Conv2d(64, 64, kernel_size=3, stride=1),  # B x 64 x 7 x 7
            nn.ReLU())

        # Compute the flattened conv output size once instead of running the
        # dummy forward pass twice as the original did.
        feat_size = self.feature_size()

        self.noisy_value1 = NoisyLinear(feat_size, 512, use_cuda=USE_CUDA)
        self.noisy_value2 = NoisyLinear(512, self.num_atoms, use_cuda=USE_CUDA)
        self.noisy_advantage1 = NoisyLinear(feat_size, 512, use_cuda=USE_CUDA)
        self.noisy_advantage2 = NoisyLinear(512, self.num_atoms * self.num_actions, use_cuda=USE_CUDA)

    def forward(self, x):
        """Return per-action probability distributions.

        Args:
            x: image batch with pixel values in [0, 255],
               shape (batch, *input_shape).
        Returns:
            Tensor of shape (batch, num_actions, num_atoms).
        """
        batch_size = x.size(0)

        x = x / 255.  # scale raw pixels to [0, 1]
        x = self.features(x)
        x = x.view(batch_size, -1)

        value = F.relu(self.noisy_value1(x))
        value = self.noisy_value2(value)

        advantage = F.relu(self.noisy_advantage1(x))
        advantage = self.noisy_advantage2(advantage)

        value = value.view(batch_size, 1, self.num_atoms)
        advantage = advantage.view(batch_size, self.num_actions, self.num_atoms)

        # Dueling recombination: Q = V + A - mean_a(A).
        x = value + advantage - advantage.mean(1, keepdim=True)
        # FIX: pass dim=1 explicitly — implicit-dim softmax is deprecated and
        # ambiguous in modern PyTorch; this matches the sibling classes.
        x = F.softmax(x.view(-1, self.num_atoms), dim=1).view(-1, self.num_actions, self.num_atoms)

        return x

    def reset_noise(self):
        """Resample the factorized noise of every NoisyLinear layer."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def feature_size(self):
        """Flattened size of the conv output, found via a dummy forward pass."""
        return self.features(
            autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)

    def act(self, state):
        """Return the index of the action with the highest expected value."""
        state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
        dist = self.forward(state).data.cpu()
        # Expected value per action: sum over atoms of probability * support.
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
        action = dist.sum(2).max(1)[1].numpy()[0]
        return action
class RainbowDQN(nn.Module):
    """Rainbow agent head (small MLP variant) for vector observations.

    Dueling NoisyLinear streams on a two-layer MLP trunk, producing a C51
    categorical distribution over `num_atoms` support points in [Vmin, Vmax]
    for each action. Hidden width is controlled by `numNodes` below.
    """

    def __init__(self, num_inputs, num_actions, num_atoms, Vmin, Vmax):
        """
        Args:
            num_inputs: size of the flat observation vector.
            num_actions: number of discrete actions.
            num_atoms: number of support atoms of the value distribution.
            Vmin, Vmax: bounds of the distribution support.
        """
        super(RainbowDQN, self).__init__()

        self.num_inputs = num_inputs
        self.num_actions = num_actions
        self.num_atoms = num_atoms
        self.Vmin = Vmin
        self.Vmax = Vmax

        # Hidden width: 16 for the Markovian setting.
        numNodes = 16
        # NonMarkovian setting used a wider net:
        # numNodes = 48

        self.linear1 = nn.Linear(num_inputs, numNodes)
        self.linear2 = nn.Linear(numNodes, numNodes)

        self.noisy_value1 = NoisyLinear(numNodes, numNodes, use_cuda=USE_CUDA)
        self.noisy_value2 = NoisyLinear(numNodes, self.num_atoms, use_cuda=USE_CUDA)
        self.noisy_advantage1 = NoisyLinear(numNodes, numNodes, use_cuda=USE_CUDA)
        self.noisy_advantage2 = NoisyLinear(numNodes, self.num_atoms * self.num_actions, use_cuda=USE_CUDA)

    def forward(self, x):
        """Return per-action probability distributions.

        Args:
            x: observation batch, shape (batch, num_inputs).
        Returns:
            Tensor of shape (batch, num_actions, num_atoms).
        """
        batch_size = x.size(0)

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))

        value = F.relu(self.noisy_value1(x))
        value = self.noisy_value2(value)

        advantage = F.relu(self.noisy_advantage1(x))
        advantage = self.noisy_advantage2(advantage)

        value = value.view(batch_size, 1, self.num_atoms)
        advantage = advantage.view(batch_size, self.num_actions, self.num_atoms)

        # Dueling recombination: Q = V + A - mean_a(A).
        x = value + advantage - advantage.mean(1, keepdim=True)
        # FIX: pass dim=1 explicitly — implicit-dim softmax is deprecated and
        # ambiguous in modern PyTorch; this matches the sibling classes.
        x = F.softmax(x.view(-1, self.num_atoms), dim=1).view(-1, self.num_actions, self.num_atoms)

        return x

    def reset_noise(self):
        """Resample the factorized noise of every NoisyLinear layer."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def act(self, state):
        """Return the index of the action with the highest expected value."""
        state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)
        dist = self.forward(state).data.cpu()
        # Expected value per action: sum over atoms of probability * support.
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
        action = dist.sum(2).max(1)[1].numpy()[0]
        return action