def reset_parameters(self):
    for cell in self._cells:
        # Xavier initialization, applied separately to each of the 4 gate blocks
        gate_size = self.hidden_size / 4  # note: unused below
        for weight in [cell.weight_ih, cell.weight_hh]:
            for w in torch.chunk(weight, 4, dim=0):
                init.xavier_normal_(w)
        # forget gate bias = 1
        for bias in [cell.bias_ih, cell.bias_hh]:
            torch.chunk(bias, 4, dim=0)[1].data.fill_(1)
def __init__(self, input_dim, n_hidden, n_layer, dropout, n_hop):
    super().__init__()
    self._init_h = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    self._init_c = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    self._init_i = nn.Parameter(torch.Tensor(input_dim))
    init.uniform_(self._init_h, -INI, INI)
    init.uniform_(self._init_c, -INI, INI)
    init.uniform_(self._init_i, -0.1, 0.1)
    self._lstm = nn.LSTM(
        input_dim, n_hidden, n_layer,
        bidirectional=False, dropout=dropout
    )
    self._lstm_cell = None

    # attention parameters
    self._attn_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    self._attn_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    self._attn_v = nn.Parameter(torch.Tensor(n_hidden))
    init.xavier_normal_(self._attn_wm)
    init.xavier_normal_(self._attn_wq)
    init.uniform_(self._attn_v, -INI, INI)

    # hop parameters
    self._hop_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    self._hop_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    self._hop_v = nn.Parameter(torch.Tensor(n_hidden))
    init.xavier_normal_(self._hop_wm)
    init.xavier_normal_(self._hop_wq)
    init.uniform_(self._hop_v, -INI, INI)
    self._n_hop = n_hop
def __init__(self, vocab_size, emb_dim, n_hidden, bidirectional, n_layer, dropout=0.0):
    super().__init__()
    # embedding weight parameter is shared between encoder, decoder,
    # and used as final projection layer to vocab logit
    # can initialize with pretrained word vectors
    self._embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
    self._enc_lstm = nn.LSTM(
        emb_dim, n_hidden, n_layer,
        bidirectional=bidirectional, dropout=dropout
    )
    # initial encoder LSTM states are learned parameters
    state_layer = n_layer * (2 if bidirectional else 1)
    self._init_enc_h = nn.Parameter(torch.Tensor(state_layer, n_hidden))
    self._init_enc_c = nn.Parameter(torch.Tensor(state_layer, n_hidden))
    init.uniform_(self._init_enc_h, -INIT, INIT)
    init.uniform_(self._init_enc_c, -INIT, INIT)
    # vanilla LSTM / layer-norm LSTM
    self._dec_lstm = MultiLayerLSTMCells(
        2 * emb_dim, n_hidden, n_layer, dropout=dropout
    )
    # project encoder final states to decoder initial states
    enc_out_dim = n_hidden * (2 if bidirectional else 1)
    self._dec_h = nn.Linear(enc_out_dim, n_hidden, bias=False)
    self._dec_c = nn.Linear(enc_out_dim, n_hidden, bias=False)
    # multiplicative attention
    self._attn_wm = nn.Parameter(torch.Tensor(enc_out_dim, n_hidden))
    self._attn_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    init.xavier_normal_(self._attn_wm)
    init.xavier_normal_(self._attn_wq)
    # project decoder output to emb_dim, then
    # apply weight matrix from embedding layer
    self._projection = nn.Sequential(
        nn.Linear(2 * n_hidden, n_hidden),
        nn.Tanh(),
        nn.Linear(n_hidden, emb_dim, bias=False)
    )
    # functional object for easier usage
    self._decoder = AttentionalLSTMDecoder(
        self._embedding, self._dec_lstm, self._attn_wq, self._projection
    )
def init_func(m):
    classname = m.__class__.__name__
    if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
        if init_type == 'normal':
            init.normal_(m.weight.data, 0.0, gain)
        elif init_type == 'xavier':
            init.xavier_normal_(m.weight.data, gain=gain)
        elif init_type == 'kaiming':
            init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
        elif init_type == 'orthogonal':
            init.orthogonal_(m.weight.data, gain=gain)
        else:
            raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
        if hasattr(m, 'bias') and m.bias is not None:
            init.constant_(m.bias.data, 0.0)
    elif classname.find('BatchNorm2d') != -1:
        init.normal_(m.weight.data, 1.0, gain)
        init.constant_(m.bias.data, 0.0)
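`init_func` closes over `init_type` and `gain`, so it is meant to live inside a driver function. The sketch below shows one plausible way to supply those names and apply the closure to a network; the wrapper name, the toy model, and the defaults are illustrative assumptions, not taken from the snippet above.

import torch.nn as nn
from torch.nn import init

def init_weights(net, init_type='xavier', gain=0.02):
    # hypothetical driver: defines the init_type/gain that init_func reads,
    # then lets Module.apply visit every submodule recursively
    def init_func(m):
        classname = m.__class__.__name__
        if hasattr(m, 'weight') and ('Conv' in classname or 'Linear' in classname):
            if init_type == 'xavier':
                init.xavier_normal_(m.weight.data, gain=gain)
            else:
                init.normal_(m.weight.data, 0.0, gain)
            if hasattr(m, 'bias') and m.bias is not None:
                init.constant_(m.bias.data, 0.0)
        elif 'BatchNorm2d' in classname:
            init.normal_(m.weight.data, 1.0, gain)
            init.constant_(m.bias.data, 0.0)
    net.apply(init_func)

net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU(), nn.Conv2d(8, 2, 1))
init_weights(net, init_type='xavier', gain=0.02)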
def re_init_head(self):
    inits.xavier_normal_(self.cls_classifier.weight)
    return
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv2d') != -1:
        init.xavier_normal_(m.weight.data)
def linear_layer_weights_xavier_initialisation(self, layer):
    if isinstance(layer, nn.Linear):
        xavier_normal_(layer.weight.data)
def init_weight(self):
    # init.xavier_normal_(self.hidden_proj.weight)
    init.xavier_normal_(self.gru.weight_hh_l0)
    init.xavier_normal_(self.gru.weight_ih_l0)
    self.gru.bias_ih_l0.data.fill_(0.0)
    self.gru.bias_hh_l0.data.fill_(0.0)
def init_weights(m):
    if type(m) == nn.Linear:
        init.xavier_normal_(m.weight.data)
        init.normal_(m.bias.data)
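A minimal usage sketch (the model here is hypothetical, not from the snippet): per-module initializers like `init_weights` are normally handed to `Module.apply`, which calls them on every submodule recursively.

import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.Tanh(), nn.Linear(32, 4))
model.apply(init_weights)  # visits each submodule, so both Linear layers get Xavier weights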
def reset_parameters(self):
    # detach() shares storage with the parameter, so the in-place init updates it
    # without being tracked by autograd
    init.xavier_normal_(self.weight.detach())
    if self.bias is not None:
        self.bias.detach().zero_()
def __init__(self, in_features, out_features, bias=True):
    super(Linear, self).__init__()
    self.linear = nn.Linear(in_features, out_features, bias=bias)
    init.xavier_normal_(self.linear.weight)
def _weight_init(self, m) -> None:
    """Initialize model weights with custom distributions.

    Do not use this method but call ``MyModel.weight_init()``.
    """
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm2d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        # orthogonal init for recurrent weight matrices, normal for 1-D biases
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
def get_param(shape):
    param = Parameter(torch.Tensor(*shape))
    xavier_normal_(param.data)
    return param
def __init__(self, skip_sigmoid=False):
    super(Model2CNN, self).__init__()
    self.skip_sigmoid = skip_sigmoid

    # ---- First Block ------------------------------------------------------#
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, padding=1)
    # self.conv1_normed = nn.BatchNorm2d(8)
    torch_init.xavier_normal_(self.conv1.weight)
    self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, padding=1)
    self.conv2_normed = nn.BatchNorm2d(8)
    torch_init.xavier_normal_(self.conv2.weight)
    self.conv3 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, padding=1)
    # self.conv3_normed = nn.BatchNorm2d(8)
    torch_init.xavier_normal_(self.conv3.weight)

    # ---- Second Block -----------------------------------------------------#
    self.conv4 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, padding=1)
    # self.conv4_normed = nn.BatchNorm2d(16)
    torch_init.xavier_normal_(self.conv4.weight)
    self.conv5 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
    self.conv5_normed = nn.BatchNorm2d(16)
    torch_init.xavier_normal_(self.conv5.weight)
    self.conv6 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
    # self.conv6_normed = nn.BatchNorm2d(16)
    torch_init.xavier_normal_(self.conv6.weight)

    # ---- Third Block ------------------------------------------------------#
    self.conv7 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
    # self.conv7_normed = nn.BatchNorm2d(32)
    torch_init.xavier_normal_(self.conv7.weight)
    self.conv8 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
    self.conv8_normed = nn.BatchNorm2d(32)
    torch_init.xavier_normal_(self.conv8.weight)
    self.conv9 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
    # self.conv9_normed = nn.BatchNorm2d(32)
    torch_init.xavier_normal_(self.conv9.weight)

    # ---- Fourth Block -----------------------------------------------------#
    self.conv10 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
    # self.conv10_normed = nn.BatchNorm2d(32)
    torch_init.xavier_normal_(self.conv10.weight)
    self.conv11 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
    self.conv11_normed = nn.BatchNorm2d(32)
    torch_init.xavier_normal_(self.conv11.weight)
    self.conv12 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
    # self.conv12_normed = nn.BatchNorm2d(32)
    torch_init.xavier_normal_(self.conv12.weight)

    # ---- END Blocks -------------------------------------------------------#
    # apply max-pooling with a [2x2] kernel using tiling (*NO SLIDING WINDOW*)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    # fully connected layers
    self.fc1 = nn.Linear(in_features=8192, out_features=128)
    torch_init.xavier_normal_(self.fc1.weight)
    # out_features = number of possible diseases
    self.fc2 = nn.Linear(in_features=128, out_features=14).cuda()
    torch_init.xavier_normal_(self.fc2.weight)
    else:
        model = None
    save_file = open("data/proc/{}.pkl".format(dataset_name), "wb")
    pickle.dump(dataset, save_file)
    save_file.close()
    return model


if __name__ == "__main__":
    character_dict = {}
    train_model = proc_file("train", word2vec=True)
    proc_file("val")
    proc_file("test")

    word_num = len(character_dict)
    word_vec_dim = 128
    word_vec_matrix = torch.zeros((word_num, word_vec_dim))
    init.xavier_normal_(word_vec_matrix)
    for word in character_dict:
        cur_id = character_dict[word]
        if word in train_model.wv:
            cur_vec = train_model.wv[word]
            word_vec_matrix[cur_id] = torch.Tensor(cur_vec)

    torch.save(word_vec_matrix, "data/proc/matrix.torch")
    dict_file = open("data/proc/dict.pkl", "wb")
    pickle.dump(character_dict, dict_file)
    dict_file.close()
def __init__(self, d_in, d_out, bias=True):
    super(Linear, self).__init__()
    self.linear = nn.Linear(d_in, d_out, bias=bias)
    init.xavier_normal_(self.linear.weight)
def init(self):
    xavier_normal_(self.emb_e_real.weight.data)
    xavier_normal_(self.emb_e_img.weight.data)
    xavier_normal_(self.emb_rel_real.weight.data)
    xavier_normal_(self.emb_rel_img.weight.data)
def init(self):
    xavier_normal_(self.Er.weight.data)
    xavier_normal_(self.Rr.weight.data)
    xavier_normal_(self.Ei.weight.data)
    xavier_normal_(self.Ri.weight.data)
def init_fc(self):
    for name, param in self.fc.named_parameters():
        if 'weight' in name:
            init.xavier_normal_(param)
        if 'bias' in name:
            init.constant_(param, 0)
def init(self):
    xavier_normal_(self.emb_e.weight.data)
    xavier_normal_(self.emb_rel.weight.data)
    xavier_normal_(self.gc1.weight.data)
    xavier_normal_(self.gc2.weight.data)
def glorot_init(params):
    for p in params:
        if len(p.data.size()) > 1:
            init.xavier_normal_(p.data)
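A short, hypothetical usage sketch: `glorot_init` takes an iterable of parameters and only re-initializes tensors with more than one dimension, so 1-D biases keep their default values. The small MLP below is illustrative, not from the snippet.

import torch.nn as nn

mlp = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 1))
glorot_init(mlp.parameters())  # weight matrices get Xavier-normal values; biases are left untouched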
def __init__(self):
    super(Net_PointRR_v2, self).__init__()
    # unused parameters
    self.patch_num = args.patch_num
    self.dim_k = args.dim_k
    self.cycle = args.cycle
    self.delta = args.delta
    self.learn_delta = args.learn_delta
    # basic settings
    self.emb_dims = args.emb_dims
    self.top_k = 1024
    self.state_dim = 1024

    ######################## LAYERS #########################
    ###### The Lie algebra computation
    self.exp = LieAlgebra.se3.Exp                    # [B, 6] -> [B, 4, 4]
    self.rigid_transform = LieAlgebra.se3.transform  # [B, 1, 4, 4] x [B, N, 3] -> [B, N, 3]
    ###### End part

    ###### The layers for the source point cloud
    self.emb_nn_source = DGCNN(emb_dims=self.emb_dims)
    ###### End part

    ###### The layers for rigid registration
    ### 1. Feature extraction
    self.emb_nn_rigid = DGCNN(emb_dims=self.emb_dims)
    ### 2. Transformer
    self.pointer_rigid = Transformer(args=args)
    ### 3. Rotation and translation
    mlp_rt_rigid = [512, 512, 512, 256, 128, 64]
    self.rt_mlp_rigid = MLPNet(self.top_k, mlp_rt_rigid, b_shared=True).layers
    self.rt_rigid = torch.nn.Conv1d(64, 6, 1)
    init.xavier_normal_(self.rt_rigid.weight, gain=1.0)
    ### 4. Init the state H
    self.emb_nn_source_state = DGCNN(emb_dims=self.state_dim)
    ###### End part

    ###### The layers for non-rigid registration
    ### 1. Feature extraction
    self.emb_nn = DGCNN(emb_dims=self.emb_dims)
    ### 2. Transformer
    self.pointer = Transformer(args=args)
    ### 3. Point-wise weight
    mlp_w = [512, 256, 256, 256]
    self.point_wise_weight_mlp = MLPNet(self.state_dim, mlp_w, b_shared=True).layers
    mlp_w_2 = [64]
    self.point_wise_weight_mlp_2 = MLPNet(256, mlp_w_2, b_shared=True).layers
    self.point_wise_weight = torch.nn.Conv1d(64, 1, 1)
    init.xavier_normal_(self.point_wise_weight.weight, gain=1.0)
    ### 4. Rotation and translation
    mlp_rt = [512, 256]
    self.rt_mlp = MLPNet(self.state_dim, mlp_rt, b_shared=True).layers
    mlp_rt_2 = [256, 128, 64]
    self.rt_mlp_2 = MLPNet(512, mlp_rt_2, b_shared=True).layers
    mlp_rt_3 = [64, 64, 64]
    self.rt_mlp_3 = MLPNet(64, mlp_rt_3, b_shared=True).layers
    self.rt = torch.nn.Conv1d(64, 6, 1)
    init.xavier_normal_(self.rt.weight, gain=1.0)

    ###### The layers of the GRU
    ### 1. Acquire Z
    self.points_mlp_z = torch.nn.Conv1d(
        4 * self.emb_dims + self.top_k + self.state_dim, self.state_dim, 1)
    init.xavier_normal_(self.points_mlp_z.weight, gain=1.0)
    ### 2. Acquire R
    self.points_mlp_r = torch.nn.Conv1d(
        4 * self.emb_dims + self.top_k + self.state_dim, self.state_dim, 1)
    init.xavier_normal_(self.points_mlp_r.weight, gain=1.0)
    ### 3. Acquire the H-wave (candidate hidden state)
    self.points_mlp_hwave = torch.nn.Conv1d(
        4 * self.emb_dims + self.state_dim + self.top_k, self.state_dim, 1)
    init.xavier_normal_(self.points_mlp_hwave.weight, gain=1.0)

    ###### Others
    self.sigmoid = torch.nn.Sigmoid()
def init_weight(self):
    init.xavier_normal_(self.gru.weight_hh_l0)
    init.xavier_normal_(self.gru.weight_ih_l0)
    self.gru.bias_ih_l0.data.fill_(0.0)
    self.gru.bias_hh_l0.data.fill_(0.0)
initial_state_dict = copy.deepcopy(model.state_dict())
utils.checkdir(f'{os.getcwd()}/saves/{args.arch_type}/{args.dataset}/')
torch.save(
    model,
    f'{os.getcwd()}/saves/{args.arch_type}/{args.dataset}/initial_state_dict.pth.tar'
)

# Making Initial Mask
mask = []
score = []
for name, p in model.named_parameters():
    if 'weight' in name:
        tensor = p.data.cpu().numpy()
        mask.append(np.ones_like(tensor))
        if p.data.dim() > 1:
            score.append(init.xavier_normal_(torch.ones_like(p.data)))
        else:
            score.append(
                init.normal_(torch.ones_like(p.data), mean=1, std=0.02))

optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

comp_ratio = []
bestacc = []
for i in range(args.prune_ite):
    if args.mini_batch:
        loss, acc, comp_ratio, bestacc = train(model, train_loader, optimizer,
                                               criterion, mask, score)
    comp1 = utils.print_nonzeros(model)
def __init__(self, input_dim, n_hidden, n_layer, dropout, n_hop,
             side_dim, stop, hard_attention=False):
    super().__init__()
    self._init_h = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    self._init_c = nn.Parameter(torch.Tensor(n_layer, n_hidden))
    self._init_i = nn.Parameter(torch.Tensor(input_dim))
    init.uniform_(self._init_h, -INI, INI)
    init.uniform_(self._init_c, -INI, INI)
    init.uniform_(self._init_i, -0.1, 0.1)
    self._lstm = nn.LSTM(
        input_dim, n_hidden, n_layer,
        bidirectional=False, dropout=dropout
    )
    self._lstm_cell = None

    # attention parameters
    self._attn_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    self._attn_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    self._attn_v = nn.Parameter(torch.Tensor(n_hidden))
    init.xavier_normal_(self._attn_wm)
    init.xavier_normal_(self._attn_wq)
    init.uniform_(self._attn_v, -INI, INI)

    # hop parameters
    self._hop_wm = nn.Parameter(torch.Tensor(input_dim, n_hidden))
    self._hop_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
    self._hop_v = nn.Parameter(torch.Tensor(n_hidden))
    init.xavier_normal_(self._hop_wm)
    init.xavier_normal_(self._hop_wq)
    init.uniform_(self._hop_v, -INI, INI)
    self._n_hop = n_hop

    # side info attention
    if not hard_attention:
        self.side_wm = nn.Parameter(torch.Tensor(side_dim, n_hidden))
        self.side_wq = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        self.side_v = nn.Parameter(torch.Tensor(n_hidden))
        init.xavier_normal_(self.side_wm)
        init.xavier_normal_(self.side_wq)
        init.uniform_(self.side_v, -INI, INI)
    else:
        self.side_wq = nn.Parameter(torch.Tensor(n_hidden, 1))
        self.side_wbi = nn.Bilinear(side_dim, side_dim, 1)
        init.xavier_normal_(self.side_wq)
        self._start = nn.Parameter(torch.Tensor(side_dim))
        init.uniform_(self._start)

    if not hard_attention:
        self._attn_ws = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        init.xavier_normal_(self._attn_ws)
    else:
        self._attn_ws = nn.Parameter(torch.Tensor(side_dim, n_hidden))
        init.xavier_normal_(self._attn_ws)

    # pad entity
    self._pad_entity = nn.Parameter(torch.Tensor(side_dim))
    init.uniform_(self._pad_entity)
    # eos entity
    if hard_attention:
        self._eos_entity = nn.Parameter(torch.Tensor(side_dim))
        init.uniform_(self._eos_entity)
    # stop token
    if stop:
        self._stop = nn.Parameter(torch.Tensor(input_dim))
        init.uniform_(self._stop, -INI, INI)
    self.stop = stop
    self._hard_attention = hard_attention
    if self._hard_attention:
        self.side_dim = side_dim
def weight_init(m):
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm2d):
        if hasattr(m.weight, 'data'):
            init.normal_(m.weight.data, mean=1, std=0.02)
        if hasattr(m.bias, 'data'):
            init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        init.normal_(m.bias.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        # orthogonal init for recurrent weight matrices, normal for 1-D biases
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
def init(self):
    xavier_normal_(self.R.weight.data)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('loading dataset')
if opt.init is not None:
    init_pose_np = np.load(opt.init)
    init_pose = torch.from_numpy(init_pose_np)
else:
    init_pose = None
dataset = SimulatedPointCloud(opt.data_dir, init_pose)
loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False)

latent_vecs = []
mask_vecs_pair = []
for i in range(len(dataset)):
    vec = tini.xavier_normal_(torch.ones((1, opt.num_lat))).to(device)
    vec = torch.nn.Parameter(vec)  # requires_grad defaults to True
    latent_vecs.append(vec)
    mask_vec = torch.ones(dataset.point_clouds.size()[1])
    mask_vecs_pair.append(mask_vec)

loss_fn = eval('loss.' + opt.loss)

print('creating model')
model = DeepMapping2D(loss_fn=loss_fn, n_obs=dataset.n_obs,
                      n_samples=opt.n_samples).to(device)
optimizer = optim.Adam(model.parameters(), lr=opt.lr)

if opt.model is not None:
def weight_init(m):
    '''
    Usage:
        model = Model()
        model.apply(weight_init)
    '''
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm2d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        init.normal_(m.bias.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.Embedding):
        # fixed from `init.torch.nn.init.uniform_`; `init` is already torch.nn.init
        init.uniform_(m.weight.data, -0.08, 0.08)
def __init__(self, n_head, d_model, d_k, d_v,
             residual_dropout=0.1, attention_dropout=0.1, d_positional=None):
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    if d_positional is None:
        self.partitioned = False
    else:
        self.partitioned = True

    if self.partitioned:
        self.d_content = d_model - d_positional
        self.d_positional = d_positional

        self.w_qs1 = nn.Parameter(torch_t.FloatTensor(n_head, self.d_content, d_k // 2))
        self.w_ks1 = nn.Parameter(torch_t.FloatTensor(n_head, self.d_content, d_k // 2))
        self.w_vs1 = nn.Parameter(torch_t.FloatTensor(n_head, self.d_content, d_v // 2))

        self.w_qs2 = nn.Parameter(torch_t.FloatTensor(n_head, self.d_positional, d_k // 2))
        self.w_ks2 = nn.Parameter(torch_t.FloatTensor(n_head, self.d_positional, d_k // 2))
        self.w_vs2 = nn.Parameter(torch_t.FloatTensor(n_head, self.d_positional, d_v // 2))

        init.xavier_normal_(self.w_qs1)
        init.xavier_normal_(self.w_ks1)
        init.xavier_normal_(self.w_vs1)
        init.xavier_normal_(self.w_qs2)
        init.xavier_normal_(self.w_ks2)
        init.xavier_normal_(self.w_vs2)
    else:
        self.w_qs = nn.Parameter(torch_t.FloatTensor(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch_t.FloatTensor(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch_t.FloatTensor(n_head, d_model, d_v))

        init.xavier_normal_(self.w_qs)
        init.xavier_normal_(self.w_ks)
        init.xavier_normal_(self.w_vs)

    self.attention = ScaledDotProductAttention(
        d_model, attention_dropout=attention_dropout)
    self.layer_norm = LayerNormalization(d_model)

    if not self.partitioned:
        # The lack of a bias term here is consistent with the t2t code, though
        # in my experiments I have never observed this making a difference.
        self.proj = nn.Linear(n_head * d_v, d_model, bias=False)
    else:
        self.proj1 = nn.Linear(n_head * (d_v // 2), self.d_content, bias=False)
        self.proj2 = nn.Linear(n_head * (d_v // 2), self.d_positional, bias=False)

    self.residual_dropout = FeatureDropout(residual_dropout)
def re_init_head(self):
    inits.xavier_normal_(self.cls_classifier.weight)
    print('clustering head re-initialized')
    return
def weight_init(m):
    """
    Usage:
        model = Model()
        model.apply(weight_init)
    """
    if isinstance(m, nn.Linear):
        init_pytorch_defaults(m, version="041")
    elif isinstance(m, nn.Conv2d):
        init_pytorch_defaults(m, version="041")
    elif isinstance(m, nn.BatchNorm1d):
        init_pytorch_defaults(m, version="041")
    elif isinstance(m, nn.BatchNorm2d):
        init_pytorch_defaults(m, version="041")
    elif isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
def weight_init(m):
    if isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight)
        init.constant_(m.bias, 0)
def _weights_init(m):
    classname = m.__class__.__name__
    # print(classname)
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight)