def __init__(self, input_size, hidden_sizes, output_size,
             act_func='sigmoid', train_alg='batch'):
    """
    Parameters:
    ------------------
    - input_size: integer, the number of features in the input
    - hidden_sizes: list of integers, the number of units in each hidden layer
    - output_size: integer, the length of the output vector
    - act_func: string, name of the activation function used in each hidden layer
    - train_alg: string, allowed values are {'batch', 'reweight', 'naive'}
    """
    super(MLP, self).__init__()
    self.input_size = input_size
    layer_sizes = [input_size] + hidden_sizes
    self.linears = nn.ModuleList([
        Linear(in_size, out_size, bias=True)
        for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:])
    ])
    self.output_layer = Linear(hidden_sizes[-1], output_size, bias=True)
    self.act = activation[act_func]
    self.train_alg = train_alg

    # list of layers in the network
    self.layers = [layer for layer in self.linears]
    self.layers.append(self.output_layer)
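# NOTE: `self.act = activation[act_func]` assumes a module-level lookup table
# that is not shown in this section. A minimal sketch of what it might contain
# (the exact entries are an assumption, not taken from the original module):
activation = {
    'sigmoid': torch.sigmoid,
    'tanh': torch.tanh,
    'relu': F.relu,
}
# With such a table in scope, MLP(784, [300, 100], 10, act_func='relu') would
# build a 784-300-100-10 classifier with ReLU hidden units.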
class SimpleRNN(PeGradNet):
    def __init__(self, input_size, hidden_size, num_classes, train_alg='batch'):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = num_classes
        self.train_alg = train_alg

        self.rnn = RNNCell(input_size, hidden_size)
        self.fc = Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        # seq_len x batch_size x input_size
        x = x.squeeze(1).permute(1, 0, 2)
        self.rnn.reset_pgrad()

        hx = torch.zeros(x.shape[1], self.hidden_size, device=x.device)
        for t in range(x.shape[0]):
            hx = self.rnn(x[t], hx)
        logits = self.fc(hx)

        return logits

    def per_example_gradient(self, loss):
        grads = []
        # copy so the RNN cell's own list of pre-activations is not mutated
        pre_acts = list(self.rnn.pre_activation)
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)
        grads.extend(self.rnn.per_example_gradient(Z_grad[:-1]))
        grads.extend(self.fc.per_example_gradient(Z_grad[-1]))

        return grads

    def pe_grad_norm(self, loss, batch_size, device):
        grad_norm = torch.zeros(batch_size, device=device, requires_grad=False)
        # copy so the RNN cell's own list of pre-activations is not mutated
        pre_acts = list(self.rnn.pre_activation)
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)
        grad_norm.add_(self.rnn.pe_grad_sqnorm(Z_grad[:-1]))
        grad_norm.add_(self.fc.pe_grad_sqnorm(Z_grad[-1]))
        grad_norm.sqrt_()

        return grad_norm
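# A minimal sketch of how pe_grad_norm might be used for per-example gradient
# clipping (DP-SGD style). Assumptions: `model` is the SimpleRNN above, the
# loss is summed over the batch (reduction='sum') so that autograd.grad with
# respect to the pre-activations yields per-example derivatives, and `clip`
# is an illustrative threshold, not a value taken from the original code.
import torch
import torch.nn.functional as F

def clipping_weights(model, x, y, clip=1.0):
    logits = model(x)
    loss = F.cross_entropy(logits, y, reduction='sum')
    norms = model.pe_grad_norm(loss, batch_size=x.size(0), device=x.device)
    # weight_i < 1 exactly for the examples whose gradient norm exceeds `clip`
    return torch.clamp(clip / (norms + 1e-6), max=1.0)

# These weights could then be used to rescale the per-example losses before the
# actual backward pass, which may be what the 'reweight' training option refers to.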
def __init__(self, input_size, channel_sizes, kernel_sizes, fc_sizes,
             num_classes, train_alg='batch'):
    super().__init__()
    self.input_size = input_size
    self.kernel_sizes = kernel_sizes
    self.act = F.relu
    self.pooling = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

    # convolutional layers
    layers = []
    out_size = input_size
    for c_in, c_out, k in zip(channel_sizes[:-1], channel_sizes[1:], kernel_sizes):
        layer = Conv2d(c_in, c_out, k)
        layers.append(layer)
        out_size = conv_outsize(out_size, k, layer.padding[0], layer.stride[0])
        out_size = conv_outsize(out_size, self.pooling.kernel_size,
                                self.pooling.padding, self.pooling.stride)
    self.convs = nn.ModuleList(layers)
    self.conv_outsize = out_size * out_size * c_out

    # fully-connected layers
    fc_sizes = [self.conv_outsize] + fc_sizes
    self.linears = nn.ModuleList([
        Linear(in_size, out_size)
        for in_size, out_size in zip(fc_sizes[:-1], fc_sizes[1:])
    ])
    self.output_layer = Linear(fc_sizes[-1], num_classes)

    self.layers = [layer for layer in self.convs]
    self.layers += [layer for layer in self.linears]
    self.layers.append(self.output_layer)
    self.train_alg = train_alg
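# conv_outsize is referenced above but not shown. A plausible implementation,
# assuming the standard output-size formula for square inputs with dilation 1:
def conv_outsize(in_size, kernel_size, padding, stride):
    # floor((n + 2p - k) / s) + 1
    return (in_size + 2 * padding - kernel_size) // stride + 1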
def __init__(self, output_size, cfg='A', train_alg='batch', batch_norm=False,
             pre_trained=False, init_weights=True):
    super(VGG, self).__init__()
    self.layers = []
    self.features = make_layers(cfgs[cfg], self.layers, batch_norm)
    self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
    self.classifier = Sequential(
        Linear(512 * 7 * 7, 4096),
        nn.ReLU(True),
        nn.Dropout(),
        Linear(4096, 4096),
        nn.ReLU(True),
        nn.Dropout(),
        Linear(4096, output_size),
    )
    self.layers += [self.classifier[0], self.classifier[3], self.classifier[6]]

    if init_weights:
        self._initialize_weights()
    self.train_alg = train_alg
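# The configuration table and layer factory used above are not shown. A sketch
# under the following assumptions: the 'A' entry follows the standard VGG-11
# layout, and the extra list argument is how make_layers hands the created
# convolution layers back to the model for per-example-gradient tracking.
cfgs = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
}

def make_layers(cfg, pe_layers, batch_norm=False):
    layers, in_channels = [], 3
    for v in cfg:
        if v == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        else:
            conv = Conv2d(in_channels, v, kernel_size=3, padding=1)
            pe_layers.append(conv)  # track for per-example gradients
            if batch_norm:
                layers += [conv, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)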
def __init__(self, input_size, hidden_size, num_classes, num_layers=1,
             train_alg='batch', bias=True):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    # self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers,
    #                   nonlinearity='tanh')
    self.rnn = RNNModule(input_size, hidden_size)
    self.output_layer = Linear(hidden_size, num_classes)
    self.train_alg = train_alg
    self.layers = [self.rnn, self.output_layer]
def __init__(self, block, layers, num_classes=10, zero_init_residual=False,
             norm_layer=None, train_alg='batch'):
    super(ResNet, self).__init__()
    self.train_alg = train_alg
    self.inplanes = 64
    self.dilation = 1
    if norm_layer is None:
        norm_layer = nn.BatchNorm2d
    self._norm_layer = norm_layer

    self.conv1 = Conv2d(3, self.inplanes, kernel_size=6, stride=2, padding=3,
                        bias=False)
    self.bn1 = norm_layer(self.inplanes)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = Linear(512 * block.expansion, num_classes)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

    # Zero-initialize the last BN in each residual branch so that the branch
    # starts at zero and each residual block behaves like an identity.
    # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
    if zero_init_residual:
        for m in self.modules():
            if isinstance(m, Bottleneck):
                nn.init.constant_(m.bn3.weight, 0)
            elif isinstance(m, BasicBlock):
                nn.init.constant_(m.bn2.weight, 0)

    # collect the layers whose per-example gradients need to be computed
    self.layers = [self.conv1]
    add_pegrad_layers(self.layer1, self.layers)
    add_pegrad_layers(self.layer2, self.layers)
    add_pegrad_layers(self.layer3, self.layers)
    add_pegrad_layers(self.layer4, self.layers)
    self.layers.append(self.fc)
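# add_pegrad_layers is not shown in this section. A plausible sketch, assuming
# it walks one residual stage and appends every per-example-gradient-capable
# layer (the custom Conv2d/Linear wrappers) to the collection list:
def add_pegrad_layers(stage, layers):
    for m in stage.modules():
        if isinstance(m, (Conv2d, Linear)):
            layers.append(m)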
class SimpleLSTM(PeGradNet):
    def __init__(self, input_size, hidden_size, output_size, train_alg='batch'):
        super(SimpleLSTM, self).__init__()
        self.lstm = LSTMCell(input_size, hidden_size)
        self.fc = Linear(hidden_size, output_size)
        self.train_alg = train_alg

    def forward(self, x, init_states=None):
        # x = x.squeeze(1)
        batch_size = x.shape[0]
        # flatten to (batch_size, seq_len, input_size)
        x = x.reshape(batch_size, x.shape[2], -1)
        seq_size = x.shape[1]
        hidden_size = self.lstm.hidden_size
        self.lstm.reset_pgrad()

        if init_states is None:
            h_t, c_t = (torch.zeros(batch_size, hidden_size, device=x.device),
                        torch.zeros(batch_size, hidden_size, device=x.device))
        else:
            h_t, c_t = init_states

        for t in range(seq_size):
            x_t = x[:, t, :]
            h_t, c_t = self.lstm(x_t, h_t, c_t)
        logits = self.fc(h_t)

        return logits

    def pe_grad_norm(self, loss, batch_size, device):
        grad_norm = torch.zeros(batch_size, device=device)
        # copy so the LSTM cell's own list of pre-activations is not mutated
        pre_acts = list(self.lstm.pre_activation)
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)
        grad_norm.add_(self.lstm.pe_grad_sqnorm(Z_grad[:-1]))
        grad_norm.add_(self.fc.pe_grad_sqnorm(Z_grad[-1]))
        grad_norm.sqrt_()

        return grad_norm
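# For reference, a small shape check of the forward pass. The MNIST-like sizes
# are illustrative assumptions: a (batch, 1, 28, 28) input is reshaped into a
# 28-step sequence of 28-dimensional rows before the LSTM loop.
import torch

model = SimpleLSTM(input_size=28, hidden_size=128, output_size=10)
x = torch.randn(32, 1, 28, 28)
logits = model(x)  # -> shape (32, 10)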
class TransformerModel(nn.Module):
    def __init__(self, n_token, n_classes, d_model=512, n_layers=2, n_head=8,
                 n_hidden=2048, dropout=0.1, max_seq_len=512, embeddings=None,
                 train_alg='batch'):
        super(TransformerModel, self).__init__()
        self.train_alg = train_alg
        self.d_model = d_model
        self.n_head = n_head

        if embeddings is None:
            self.token_embedding = nn.Embedding(n_token, d_model)
        else:
            self.token_embedding = nn.Embedding.from_pretrained(embeddings)
            self.token_embedding.weight.requires_grad = False
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_seq_len)

        encoder_layers = TransformerEncoderLayer(d_model, n_head, n_hidden, dropout)
        # encoder_norm = nn.LayerNorm(d_model)
        encoder_norm = None
        self.encoder = TransformerEncoder(encoder_layers, n_layers, encoder_norm)
        self.fc = Linear(d_model, n_classes)

    def init_weights(self):
        initrange = 0.1
        self.token_embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)

    def forward(self, x):
        x = x.transpose(0, 1)        # [sentence length, batch_size]
        x = self.token_embedding(x)  # [sentence length, batch_size, embedding dim]
        x = self.pos_encoder(x)      # [sentence length, batch_size, embedding dim]
        output = self.encoder(x)     # [sentence length, batch_size, embedding dim]
        avg_out = output.transpose(0, 1).mean(dim=1)  # [batch_size, embedding dim]
        preact = self.fc(avg_out)    # [batch_size, num_classes]

        return preact

    def per_example_gradient(self, loss):
        grads = []
        pre_acts = []
        pre_acts.extend(self.encoder.collect_preactivations())
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)
        # TransformerEncoder
        grads.extend(self.encoder.per_example_gradient(Z_grad[:-1]))
        # fully connected layer
        grads.extend(self.fc.per_example_gradient(Z_grad[-1]))

        return grads

    def pe_grad_norm(self, loss, batch_size, device):
        grad_norm = torch.zeros(batch_size, device=device)
        pre_acts = []
        pre_acts.extend(self.encoder.collect_preactivations())
        pre_acts.append(self.fc.pre_activation)

        Z_grad = torch.autograd.grad(loss, pre_acts, retain_graph=True)
        grad_norm.add_(self.encoder.pe_grad_sqnorm(Z_grad[:-1]))
        grad_norm.add_(self.fc.pe_grad_sqnorm(Z_grad[-1]))
        grad_norm.sqrt_()

        return grad_norm
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", pe_grad=True):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        self._pe_modules = [
            self.self_attn, self.linear1, self.linear2, self.norm1, self.norm2
        ]

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # self-attention block with residual connection
        src2 = self.self_attn(src, src, src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        # feed-forward block with residual connection
        if hasattr(self, "activation"):
            src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        else:  # for backward compatibility
            src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        out = src + self.dropout2(src2)
        out = self.norm2(out)

        return out

    def per_example_gradient(self):
        grads = []
        for m in self._pe_modules:
            grads.extend(m.per_example_gradient())
        return grads

    def pe_grad_sqnorm(self, deriv_pre_activ):
        batch_size = deriv_pre_activ[0].size(1)
        device = deriv_pre_activ[0].device
        grad_sq_norm = torch.zeros(batch_size, device=device)

        grad_sq_norm.add_(self.self_attn.pe_grad_sqnorm(deriv_pre_activ[:2]))
        grad_sq_norm.add_(self.linear1.pe_grad_sqnorm(deriv_pre_activ[2]))
        grad_sq_norm.add_(self.linear2.pe_grad_sqnorm(deriv_pre_activ[3]))
        grad_sq_norm.add_(self.norm1.pe_grad_sqnorm(deriv_pre_activ[4]))
        grad_sq_norm.add_(self.norm2.pe_grad_sqnorm(deriv_pre_activ[5]))

        return grad_sq_norm

    def collect_preactivations(self):
        pre_acts = []
        pre_acts.extend(self.self_attn.collect_preactivations())
        pre_acts.append(self.linear1.pre_activation)
        pre_acts.append(self.linear2.pre_activation)
        pre_acts.append(self.norm1.pre_activation)
        pre_acts.append(self.norm2.pre_activation)

        return pre_acts
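# _get_activation_fn is referenced above but not shown. A sketch that mirrors
# the PyTorch reference implementation (an assumption, not this codebase's copy):
def _get_activation_fn(activation):
    if activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu
    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))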
class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0., bias=True,
                 add_bias_kv=False, add_zero_attn=False):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, \
            "embed_dim must be divisible by num_heads"

        self.in_proj = Linear(embed_dim, 3 * embed_dim)
        # self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
        # if bias:
        #     self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        # else:
        #     self.register_parameter('in_proj_bias', None)
        self.out_proj = Linear(embed_dim, embed_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        xavier_uniform_(self.in_proj.weight)
        if self.in_proj.bias is not None:
            constant_(self.in_proj.bias, 0.)
            constant_(self.out_proj.bias, 0.)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None):
        attn_out, _ = multi_head_attention_forward(
            query, key, value, self.embed_dim, self.num_heads,
            self.in_proj, self.dropout, self.out_proj,
            training=self.training,
            key_padding_mask=key_padding_mask,
            need_weights=need_weights,
            attn_mask=attn_mask)

        return attn_out

    def per_example_gradient(self, deriv_pre_activ_in, deriv_pre_activ_out):
        pe_grad_weight_in, pe_grad_bias_in = \
            self.in_proj.per_example_gradient(deriv_pre_activ_in)
        pe_grad_weight_out, pe_grad_bias_out = \
            self.out_proj.per_example_gradient(deriv_pre_activ_out)

        return (pe_grad_weight_in, pe_grad_bias_in,
                pe_grad_weight_out, pe_grad_bias_out)

    def pe_grad_sqnorm(self, deriv_pre_activ):
        grads = self.per_example_gradient(*deriv_pre_activ)
        batch_size = grads[0].size(0)
        grad_sq_norm = torch.zeros(batch_size, device=grads[0].device)
        for grad in grads:
            grad_sq_norm.add_(grad.pow(2).view(batch_size, -1).sum(1))

        return grad_sq_norm

    def collect_preactivations(self):
        return (self.in_proj.pre_activation,
                self.out_proj.pre_activation)
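# For context, a sketch of the per-example gradient of a linear map
# Z = X @ W.T + b, which is what the custom Linear.per_example_gradient
# presumably computes from the derivative of the loss w.r.t. its
# pre-activation Z. Shapes below assume a plain (batch, features) input;
# for (seq_len, batch, features) inputs, as in the attention projections,
# the outer products would additionally be summed over the time dimension.
import torch

def linear_pe_grad(X, dZ):
    # X:  (batch, in_features)   layer input
    # dZ: (batch, out_features)  dLoss/dZ for each example
    pe_grad_weight = torch.einsum('bo,bi->boi', dZ, X)  # (batch, out, in)
    pe_grad_bias = dZ                                   # (batch, out)
    return pe_grad_weight, pe_grad_bias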