def init_func(m):
    """Initialise one module in place; intended to be passed to ``Module.apply``.

    Reads ``init_type`` and ``gain`` from the enclosing scope.

    * Modules that have a ``weight`` attribute and whose class name contains
      ``'Conv'`` or ``'Linear'`` get their weight filled according to
      ``init_type`` and their bias (when present) set to zero.
    * Modules whose class name contains ``'BatchNorm2d'`` get a weight drawn
      from N(1.0, gain) and a zero bias.
    * Everything else is left untouched.

    Raises:
        NotImplementedError: if ``init_type`` is not one of ``'normal'``,
            ``'xavier'``, ``'kaiming'`` or ``'orthogonal'``.
    """
    layer_name = m.__class__.__name__
    looks_like_conv_or_linear = 'Conv' in layer_name or 'Linear' in layer_name

    if not (hasattr(m, 'weight') and looks_like_conv_or_linear):
        # Only batch-norm layers get special treatment outside conv/linear.
        if 'BatchNorm2d' in layer_name:
            init.normal_(m.weight.data, 1.0, gain)
            init.constant_(m.bias.data, 0.0)
        return

    weights = m.weight.data
    if init_type == 'normal':
        init.normal_(weights, 0.0, gain)
    elif init_type == 'xavier':
        init.xavier_normal_(weights, gain=gain)
    elif init_type == 'kaiming':
        init.kaiming_normal_(weights, a=0, mode='fan_in')
    elif init_type == 'orthogonal':
        init.orthogonal_(weights, gain=gain)
    else:
        raise NotImplementedError('initialization method [%s] is not implemented' % init_type)

    bias = getattr(m, 'bias', None)
    if bias is not None:
        init.constant_(bias.data, 0.0)
def init_weights(self):
    """Orthogonally initialise every multi-dimensional parameter of ``self.rnn``.

    Parameters with a single dimension (e.g. bias vectors) are not modified.
    """
    # Only the matrices (gate weights) are re-initialised.
    matrices = (tensor for tensor in self.rnn.parameters() if tensor.dim() > 1)
    for matrix in matrices:
        weight_init.orthogonal_(matrix)
def _initialize_gru(self): for param in self.gru.parameters(): if len(param.shape) >= 2: init.orthogonal_(param.data) else: init.normal_(param.data)
def init_weight(self):
    """Orthogonally initialise the first-layer weight matrices of ``self.gru``.

    The hidden-to-hidden matrix is initialised first, then the
    input-to-hidden matrix; no other parameter is modified.
    """
    for attr in ('weight_hh_l0', 'weight_ih_l0'):
        init.orthogonal_(getattr(self.gru, attr))
def __init__(self, hps, *_):
    """Build the embeddings, linear layers and two stacked BiLSTMs of the tagger.

    Args:
        hps: mapping of hyper-parameters. The keys read here are
            'batch_size', 'sent_hdim', 'sent_edim', 'pos_edim', 'role_edim',
            'vword', 'vbio', 'vpos', 'vdep', 'vframe', 'rec_layers' and
            'word_embeddings' (the latter is passed to ``torch.from_numpy``).
        *_: any further positional arguments are accepted and ignored.
    """
    super(BiLSTMTagger, self).__init__()
    batch_size = hps['batch_size']
    lstm_hidden_dim = hps['sent_hdim']
    sent_embedding_dim = 3*hps['sent_edim'] + 1*hps['pos_edim']
    ## for the region mark
    sent_embedding_dim += 1
    role_embedding_dim = hps['role_edim']
    frame_embedding_dim = role_embedding_dim
    vocab_size = hps['vword']
    self.tagset_size = hps['vbio']
    self.pos_size = hps['vpos']
    self.dep_size = hps['vdep']
    self.frameset_size = hps['vframe']
    # This value is overwritten with a literal 2 before the LSTMs are built.
    self.num_layers = hps['rec_layers']
    self.batch_size = batch_size
    self.hidden_dim = lstm_hidden_dim
    self.word_emb_dim = hps['sent_edim']
    self.word_embeddings = nn.Embedding(vocab_size, hps['sent_edim'])
    self.pos_embeddings = nn.Embedding(self.pos_size, hps['pos_edim'])
    # Uses the 'pos_edim' width for the dependency embedding as well.
    self.dep_embeddings = nn.Embedding(self.dep_size, hps['pos_edim'])
    self.p_lemma_embeddings = nn.Embedding(self.frameset_size, hps['sent_edim'])
    #self.lr_dep_embeddings = nn.Embedding(self.lr_dep_size, hps[])
    # Second word-embedding table whose weights are copied from the
    # array supplied in hps['word_embeddings'].
    self.word_fixed_embeddings = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_fixed_embeddings.weight.data.copy_(torch.from_numpy(hps['word_embeddings']))
    self.role_embeddings = nn.Embedding(self.tagset_size, role_embedding_dim)
    self.frame_embeddings = nn.Embedding(self.frameset_size, frame_embedding_dim)
    self.hidden2tag_M = nn.Linear(2*lstm_hidden_dim, 2*lstm_hidden_dim)
    self.hidden2tag_M_copy = nn.Linear(2*lstm_hidden_dim, 2*lstm_hidden_dim)
    self.hidden2tag_H = nn.Linear(2*lstm_hidden_dim, 2*lstm_hidden_dim)
    self.MLP = nn.Linear(2*lstm_hidden_dim, self.dep_size)
    # The LSTM takes word embeddings as inputs, and outputs hidden states
    # with dimensionality hidden_dim.
    self.num_layers = 2
    self.BiLSTM_share = nn.LSTM(input_size=sent_embedding_dim,
                                hidden_size=lstm_hidden_dim,
                                batch_first=True,
                                bidirectional=True,
                                num_layers=self.num_layers)
    # NOTE(review): only all_weights[0] and all_weights[1] are touched; with
    # num_layers=2 these look like the two directions of the first layer only,
    # so deeper layers would keep the default init - confirm this is intended.
    init.orthogonal_(self.BiLSTM_share.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_share.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_share.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_share.all_weights[1][1])
    self.num_layers = 2
    # Second BiLSTM consumes the concatenated (2 * hidden) output width.
    self.BiLSTM_SRL = nn.LSTM(input_size=lstm_hidden_dim * 2,
                              hidden_size=lstm_hidden_dim,
                              batch_first=True,
                              bidirectional=True,
                              num_layers=self.num_layers)
    init.orthogonal_(self.BiLSTM_SRL.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[1][1])
    # non-linear map to role embedding
    self.role_map = nn.Linear(in_features=role_embedding_dim * 2,
                              out_features=self.hidden_dim * 4)
    # Init hidden state
    # init_hidden() is defined elsewhere on the class (not visible here).
    self.hidden = self.init_hidden()
    self.hidden_2 = self.init_hidden()
    self.hidden_3 = self.init_hidden()
    self.hidden_4 = self.init_hidden()
def init_ortho(module):
    """Apply orthogonal initialisation to every exactly-2-D parameter of ``module``.

    Parameters of any other rank (bias vectors, convolution kernels, ...) are
    left as they are.
    """
    two_dimensional = [p for p in module.parameters() if p.dim() == 2]
    for matrix in two_dimensional:
        init.orthogonal_(matrix)
def __init__(self, hps, *_):
    """Build the dropout layers, embeddings, BiLSTMs and projection heads.

    Args:
        hps: mapping of hyper-parameters. The keys read here are
            'batch_size', 'sent_hdim', 'sent_edim', 'role_edim', 'vword',
            'vbio', 'vpos', 'vdep', 'vframe', 'rec_layers', 'svdep' and
            'word_embeddings' (the latter is passed to ``torch.from_numpy``).
        *_: any further positional arguments are accepted and ignored.
    """
    super(BiLSTMTagger, self).__init__()
    batch_size = hps['batch_size']
    lstm_hidden_dim = hps['sent_hdim']
    sent_embedding_dim_DEP = 2 * hps['sent_edim']
    sent_embedding_dim_SRL = 2 * hps['sent_edim'] + 16
    ## for the region mark
    # role_embedding_dim / frame_embedding_dim are not used again in this
    # constructor.
    role_embedding_dim = hps['role_edim']
    frame_embedding_dim = role_embedding_dim
    vocab_size = hps['vword']
    self.tagset_size = hps['vbio']
    self.pos_size = hps['vpos']
    self.dep_size = hps['vdep']
    self.frameset_size = hps['vframe']
    # This value is overwritten with literals (1, 1, 3) further down.
    self.num_layers = hps['rec_layers']
    self.batch_size = batch_size
    self.hidden_dim = lstm_hidden_dim
    self.word_emb_dim = hps['sent_edim']
    self.specific_dep_size = hps['svdep']
    self.SRL_input_dropout = nn.Dropout(p=0.3)
    self.DEP_input_dropout = nn.Dropout(p=0.3)
    self.SRL_hidden_dropout = nn.Dropout(p=0.3)
    self.DEP_hidden_dropout_1 = nn.Dropout(p=0.3)
    self.DEP_hidden_dropout_2 = nn.Dropout(p=0.3)
    self.SRL_proj_word_dropout = nn.Dropout(p=0.3)
    self.SRL_proj_predicate_dropout = nn.Dropout(p=0.3)
    self.DEP_proj_word_dropout = nn.Dropout(p=0.3)
    self.DEP_proj_predicate_dropout = nn.Dropout(p=0.3)
    self.Idenficiation_dropout = nn.Dropout(p=0.3)
    #self.use_dropout = nn.Dropout(p=0.2)
    # The BiLSTM encoder
    # The LSTM takes word embeddings as inputs, and outputs hidden states
    self.num_layers = 1
    self.word_embeddings_DEP = nn.Embedding(vocab_size, hps['sent_edim'])
    # Embedding table whose weights are copied from hps['word_embeddings'].
    self.word_fixed_embeddings_DEP = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_fixed_embeddings_DEP.weight.data.copy_(
        torch.from_numpy(hps['word_embeddings']))
    self.BiLSTM_0 = nn.LSTM(input_size=sent_embedding_dim_DEP,
                            hidden_size=lstm_hidden_dim,
                            batch_first=True,
                            bidirectional=True,
                            num_layers=self.num_layers)
    init.orthogonal_(self.BiLSTM_0.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_0.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_0.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_0.all_weights[1][1])
    self.num_layers = 1
    self.BiLSTM_1 = nn.LSTM(input_size=lstm_hidden_dim * 2,
                            hidden_size=lstm_hidden_dim,
                            batch_first=True,
                            bidirectional=True,
                            num_layers=self.num_layers)
    init.orthogonal_(self.BiLSTM_1.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_1.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_1.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_1.all_weights[1][1])
    # SRL: primary prediction
    self.num_layers = 3
    self.word_embeddings_SRL = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_fixed_embeddings_SRL = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_fixed_embeddings_SRL.weight.data.copy_(
        torch.from_numpy(hps['word_embeddings']))
    # Embedding width here is self.pos_size (a vocabulary size), as written.
    self.dep_embeddings = nn.Embedding(self.dep_size, self.pos_size)
    self.region_embeddings = nn.Embedding(2, 16)
    self.elmo_emb_size = 200
    #L + self.elmo_emb_size * 1 + 1 * self.pos_size
    self.BiLSTM_SRL = nn.LSTM(input_size=sent_embedding_dim_SRL,
                              hidden_size=lstm_hidden_dim,
                              batch_first=True,
                              bidirectional=True,
                              num_layers=self.num_layers)
    # NOTE(review): only all_weights[0] and all_weights[1] are touched; with
    # num_layers=3 these look like the two directions of the first layer only,
    # so deeper layers would keep the default init - confirm this is intended.
    init.orthogonal_(self.BiLSTM_SRL.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[1][1])
    self.elmo_mlp = nn.Sequential(
        nn.Linear(2 * lstm_hidden_dim, self.elmo_emb_size), nn.ReLU())
    self.elmo_w = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma = nn.Parameter(torch.ones(1))
    self.W_R = nn.Parameter(
        torch.rand(lstm_hidden_dim + 1, self.tagset_size * (lstm_hidden_dim + 1)))
    #self.W_share = nn.Parameter(torch.rand(lstm_hidden_dim, lstm_hidden_dim))
    self.Non_Predicate_Proj = nn.Linear(2 * lstm_hidden_dim, lstm_hidden_dim)
    self.Predicate_Proj = nn.Linear(2 * lstm_hidden_dim, lstm_hidden_dim)
    self.cvt_hidden_dim = 200
    ## SRL: auxiliary prediction: fwd-fwd
    self.Non_Predicate_Proj_FF = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.Predicate_Proj_FF = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.W_R_FF = nn.Parameter(
        torch.rand(self.cvt_hidden_dim + 1, self.tagset_size * self.cvt_hidden_dim))
    ## SRL: auxiliary prediction: bwd-bwd
    self.Non_Predicate_Proj_BB = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.Predicate_Proj_BB = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.W_R_BB = nn.Parameter(
        torch.rand(self.cvt_hidden_dim + 1, self.tagset_size * self.cvt_hidden_dim))
    ## SRL: auxiliary prediction: fwd-bwd
    self.Non_Predicate_Proj_FB = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.Predicate_Proj_FB = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.W_R_FB = nn.Parameter(
        torch.rand(self.cvt_hidden_dim + 1, self.tagset_size * self.cvt_hidden_dim))
    ## SRL: auxiliary prediction: bwd-fwd
    self.Non_Predicate_Proj_BF = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.Predicate_Proj_BF = nn.Linear(lstm_hidden_dim, self.cvt_hidden_dim)
    self.W_R_BF = nn.Parameter(
        torch.rand(self.cvt_hidden_dim + 1, self.tagset_size * self.cvt_hidden_dim))
    # Dependency extractor: primary prediction
    self.hidden2tag_1 = nn.Linear(4 * lstm_hidden_dim, lstm_hidden_dim)
    self.hidden2tag_2 = nn.Linear(4 * lstm_hidden_dim, lstm_hidden_dim)
    self.W_dep = nn.Parameter(
        torch.rand(lstm_hidden_dim + 1, self.specific_dep_size * lstm_hidden_dim))
    self.tag2hidden = nn.Linear(self.specific_dep_size, self.pos_size, bias=False)
    # Dependency extractor: auxiliary FF
    self.hidden2tag_1_FF = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.hidden2tag_2_FF = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.MLP_FF = nn.Linear(2 * lstm_hidden_dim, 2 * lstm_hidden_dim)
    self.MLP_FF_2 = nn.Linear(2 * lstm_hidden_dim, self.specific_dep_size)
    # Dependency extractor: auxiliary BB
    self.hidden2tag_1_BB = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.hidden2tag_2_BB = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.MLP_BB = nn.Linear(2 * lstm_hidden_dim, 2 * lstm_hidden_dim)
    self.MLP_BB_2 = nn.Linear(2 * lstm_hidden_dim, self.specific_dep_size)
    # Dependency extractor: auxiliary FB
    self.hidden2tag_1_FB = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.hidden2tag_2_FB = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.MLP_FB = nn.Linear(2 * lstm_hidden_dim, 2 * lstm_hidden_dim)
    self.MLP_FB_2 = nn.Linear(2 * lstm_hidden_dim, self.specific_dep_size)
    # Dependency extractor: auxiliary BF
    self.hidden2tag_1_BF = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.hidden2tag_2_BF = nn.Linear(lstm_hidden_dim, lstm_hidden_dim)
    self.MLP_BF = nn.Linear(2 * lstm_hidden_dim, 2 * lstm_hidden_dim)
    self.MLP_BF_2 = nn.Linear(2 * lstm_hidden_dim, self.specific_dep_size)
    # Predicate identification
    self.MLP_identification = nn.Linear(4 * lstm_hidden_dim, 2 * lstm_hidden_dim)
    self.Idenficiation = nn.Linear(2 * lstm_hidden_dim, 3)
    # Init hidden state
    # init_hidden_spe() / init_hidden_share() are defined elsewhere on the
    # class (not visible here).
    self.hidden = self.init_hidden_spe()
    self.hidden_2 = self.init_hidden_spe()
    self.hidden_3 = self.init_hidden_spe()
    self.hidden_4 = self.init_hidden_share()
def _initialize_weights(self): init.orthogonal_(self.conv2d.weight, init.calculate_gain('relu'))
def __init__(
    self,
    input_size,
    hidden_size,
    rnn_type='lstm',
    num_layers=1,
    num_hidden_layers=2,
    bias=True,
    batch_first=True,
    dropout=0,
    bidirectional=False,
    nr_cells=5,
    read_heads=2,
    cell_size=10,
    nonlinearity='tanh',
    gpu_id=-1,
    independent_linears=False,
    share_memory=True,
    debug=False,
    clip=20
):
    """Build the controller RNN(s), the memory module(s) and the output layer.

    Args:
        input_size: width of the external input.
        hidden_size: width of each controller RNN's hidden state; also used
            as the output size.
        rnn_type: 'rnn', 'gru' or 'lstm' (compared case-insensitively).
        num_layers: number of controller layers created by this constructor.
        num_hidden_layers: ``num_layers`` argument handed to each underlying
            ``nn.RNN`` / ``nn.GRU`` / ``nn.LSTM``.
        bias: forwarded to the underlying RNN modules.
        batch_first: stored on the instance; the RNN modules themselves are
            always created with ``batch_first=True``.
        dropout: forwarded to the underlying RNN modules.
        bidirectional: stored on the instance only.
        nr_cells: ``mem_size`` passed to ``Memory``.
        read_heads: ``read_heads`` passed to ``Memory``.
        cell_size: ``cell_size`` passed to ``Memory``.
        nonlinearity: forwarded to ``nn.RNN`` when ``rnn_type`` is 'rnn'.
        gpu_id: CUDA device index, or -1 to leave everything where it is.
        independent_linears: forwarded to ``Memory``.
        share_memory: when true a single ``Memory`` is created for all
            layers, otherwise one per layer.
        debug: stored on the instance only.
        clip: stored on the instance only.
    """
    super(DNC, self).__init__()
    # todo: separate weights and RNNs for the interface and output vectors

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.rnn_type = rnn_type
    self.num_layers = num_layers
    self.num_hidden_layers = num_hidden_layers
    self.bias = bias
    self.batch_first = batch_first
    self.dropout = dropout
    self.bidirectional = bidirectional
    self.nr_cells = nr_cells
    self.read_heads = read_heads
    self.cell_size = cell_size
    self.nonlinearity = nonlinearity
    self.gpu_id = gpu_id
    self.independent_linears = independent_linears
    self.share_memory = share_memory
    self.debug = debug
    self.clip = clip

    self.w = self.cell_size
    self.r = self.read_heads

    self.read_vectors_size = self.r * self.w * 3
    self.output_size = self.hidden_size

    self.nn_input_size = self.input_size + self.read_vectors_size
    self.nn_output_size = self.output_size + self.read_vectors_size

    self.rnns = []
    self.memories = []

    rnn_kind = self.rnn_type.lower()
    for layer in range(self.num_layers):
        # The first layer sees the raw input plus read vectors; later layers
        # see the previous layer's output plus read vectors.
        layer_input_size = self.nn_input_size if layer == 0 else self.nn_output_size
        if rnn_kind == 'rnn':
            self.rnns.append(
                nn.RNN(layer_input_size, self.output_size,
                       bias=self.bias, nonlinearity=self.nonlinearity,
                       batch_first=True, dropout=self.dropout,
                       num_layers=self.num_hidden_layers))
        elif rnn_kind == 'gru':
            self.rnns.append(
                nn.GRU(layer_input_size, self.output_size,
                       bias=self.bias, batch_first=True, dropout=self.dropout,
                       num_layers=self.num_hidden_layers))
        elif rnn_kind == 'lstm':
            self.rnns.append(
                nn.LSTM(layer_input_size, self.output_size,
                        bias=self.bias, batch_first=True, dropout=self.dropout,
                        num_layers=self.num_hidden_layers))
        # Register the RNN as an attribute so it is tracked as a sub-module.
        # (An unsupported rnn_type still fails here with IndexError, as before.)
        setattr(self, rnn_kind + '_layer_' + str(layer), self.rnns[layer])

        # memories for each layer
        if not self.share_memory:
            self.memories.append(
                Memory(input_size=self.output_size,
                       mem_size=self.nr_cells,
                       cell_size=self.w,
                       read_heads=self.r,
                       gpu_id=self.gpu_id,
                       independent_linears=self.independent_linears))
            setattr(self, 'rnn_layer_memory_' + str(layer), self.memories[layer])

    # only one memory shared by all layers
    if self.share_memory:
        self.memories.append(
            Memory(input_size=self.output_size,
                   mem_size=self.nr_cells,
                   cell_size=self.w,
                   read_heads=self.r,
                   gpu_id=self.gpu_id,
                   independent_linears=self.independent_linears))
        setattr(self, 'rnn_layer_memory_shared', self.memories[0])

    # final output layer
    self.output = nn.Linear(self.nn_output_size, self.output_size)
    orthogonal_(self.output.weight)

    if self.gpu_id != -1:
        for rnn in self.rnns:
            rnn.cuda(self.gpu_id)
        for memory in self.memories:
            memory.cuda(self.gpu_id)
        # Move the output layer to the same device as everything else; a bare
        # .cuda() would put it on the current default CUDA device instead.
        self.output.cuda(self.gpu_id)
def weight_init(m):
    '''
    Initialise the parameters of a single layer in place, chosen by its type.

    Adapted from
    https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5

    Usage:
        model = Model()
        model.apply(weight_init)

    Handled types: 1d/2d/3d convolutions and transposed convolutions, batch
    norm (1d/2d/3d), ``nn.Linear`` and LSTM/GRU modules and cells. Any other
    module is left untouched.
    '''
    batchnorm_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
    recurrent_types = (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)

    # Pick the weight filler for "weight + optional bias" layers; the other
    # families are handled completely inside their branch.
    if isinstance(m, nn.Conv1d):
        fill_weight = init.normal_
    elif isinstance(m, (nn.Conv2d, nn.Conv3d)):
        fill_weight = init.xavier_normal_
    elif isinstance(m, nn.ConvTranspose1d):
        fill_weight = init.normal_
    elif isinstance(m, (nn.ConvTranspose2d, nn.ConvTranspose3d)):
        fill_weight = init.xavier_normal_
    elif isinstance(m, batchnorm_types):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
        return
    elif isinstance(m, nn.Linear):
        fill_weight = init.xavier_normal_
    elif isinstance(m, recurrent_types):
        for tensor in m.parameters():
            if tensor.dim() >= 2:
                init.orthogonal_(tensor.data)
            else:
                init.normal_(tensor.data)
        return
    else:
        return

    fill_weight(m.weight.data)
    if m.bias is not None:
        init.normal_(m.bias.data)
def weight_init(m):
    '''
    Initialise the parameters of a single layer in place, chosen by its type.

    Usage:
        model = Model()
        model.apply(weight_init)

    * Conv1d / ConvTranspose1d: weight from ``init.normal_``.
    * Conv2d: Kaiming-normal weight for leaky ReLU with slope 0.2.
    * Conv3d / ConvTranspose2d / ConvTranspose3d: Kaiming-normal weight for
      leaky ReLU with slope 0.
    * BatchNorm1d/2d/3d: weight from N(1, 0.02), bias set to 0.
    * Linear: Xavier-normal weight.
    * LSTM / LSTMCell / GRU / GRUCell: orthogonal for parameters with two or
      more dimensions, ``init.normal_`` for the rest.

    For the convolution and linear layers the bias, when the layer has one,
    is drawn with ``init.normal_``. Any other module is left untouched.
    '''
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
    elif isinstance(m, nn.Conv2d):
        torch.nn.init.kaiming_normal_(m.weight.data, a=0.2, mode='fan_in',
                                      nonlinearity='leaky_relu')
    elif isinstance(m, nn.Conv3d):
        torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in',
                                      nonlinearity='leaky_relu')
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.ConvTranspose2d, nn.ConvTranspose3d)):
        torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in',
                                      nonlinearity='leaky_relu')
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
        return
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
        return
    else:
        return

    # Shared tail for conv / linear layers. The bias is None when the layer
    # was built with bias=False, so it must be checked before dereferencing
    # (the Linear branch previously crashed on such layers).
    if m.bias is not None:
        init.normal_(m.bias.data)
def weight_init(m):
    '''
    Initialise the parameters of a single layer in place, chosen by its type.

    Taken from: https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5

    * Conv1d / ConvTranspose1d: weight from ``init.normal_``.
    * Conv2d/3d, ConvTranspose2d/3d and Linear: Xavier-normal weight.
    * BatchNorm1d/2d/3d: weight from N(1, 0.02), bias set to 0.
    * LSTM / LSTMCell / GRU / GRUCell: orthogonal for parameters with two or
      more dimensions, ``init.normal_`` for the rest.
    * A fixed list of parameter-free or deliberately skipped layers
      (dropout, activations, pooling, InstanceNorm2d, Embedding) and any
      module that has child modules: nothing is done.
    * Anything else: a warning is printed.

    For the convolution and linear layers the bias, when the layer has one,
    is drawn with ``init.normal_``.
    '''
    skipped_types = (
        nn.Dropout,
        nn.ReLU,
        nn.ELU,
        nn.LeakyReLU,
        nn.Sigmoid,
        nn.Tanh,
        nn.MaxPool2d,
        nn.AvgPool2d,
        nn.InstanceNorm2d,
        nn.Embedding,
    )

    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.Conv2d, nn.Conv3d)):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.ConvTranspose2d, nn.ConvTranspose3d)):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
        return
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
        return
    elif isinstance(m, skipped_types):
        return
    elif len(m._modules) > 0:
        # Containers are skipped; their children are handled individually.
        return
    else:
        print("!! Warning: {} has no deafault initialization scheme".format(
            type(m)))
        return

    # Shared tail for conv / linear layers. The bias is None when the layer
    # was built with bias=False, so it must be checked before dereferencing
    # (the Linear branch previously crashed on such layers).
    if m.bias is not None:
        init.normal_(m.bias.data)
def init_weights(self):
    """Initialise the first-layer parameters of the three GRU layers in place.

    For each of ``self.gru_layer1``, ``self.gru_layer2`` and
    ``self.gru_layer3`` the first two tensors of ``all_weights[0]`` are
    initialised orthogonally with gain sqrt(2), and the next two are drawn
    uniformly from the interval between 0.1 and 1.
    """
    gain = np.sqrt(2.0)
    for layer in (self.gru_layer1, self.gru_layer2, self.gru_layer3):
        params = layer.all_weights[0]
        init.orthogonal_(params[0], gain=gain)
        init.orthogonal_(params[1], gain=gain)
        # Bounds are given as (low, high). The previous call passed (1, 0.1),
        # i.e. low > high, which current PyTorch rejects with a RuntimeError;
        # the sampled value range is the same.
        init.uniform_(params[2], 0.1, 1)
        init.uniform_(params[3], 0.1, 1)
def torch_weight_init(m):
    '''
    Initialise the parameters of a single layer in place, chosen by its type.

    Usage:
        model = Model()
        model.apply(torch_weight_init)

    * Conv1d and Linear: Xavier-uniform weight, bias (if any) set to 0.
    * Conv2d/3d and ConvTranspose2d/3d: Xavier-normal weight, bias (if any)
      from ``init.normal_``.
    * ConvTranspose1d: weight and bias (if any) from ``init.normal_``.
    * BatchNorm1d/2d: weight from N(1, 0.02); the bias is left as it is.
    * BatchNorm3d: weight from N(1, 0.02), bias set to 0.
    * LSTM / LSTMCell / GRUCell: orthogonal for parameters with two or more
      dimensions, ``init.normal_`` for the rest.
    * GRU: Xavier-uniform for ``weight_ih*``, orthogonal for ``weight_hh*``,
      zeros for biases.

    Any other module is left untouched.
    '''
    if isinstance(m, nn.Conv1d):
        init.xavier_uniform_(m.weight.data)
        if m.bias is not None:
            init.constant_(m.bias.data, 0)
    elif isinstance(m, (nn.Conv2d, nn.Conv3d)):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.ConvTranspose2d, nn.ConvTranspose3d)):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)):
        init.normal_(m.weight.data, mean=1, std=0.02)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_uniform_(m.weight.data)
        # The bias is None for nn.Linear(..., bias=False); guard it like the
        # convolution branches do instead of crashing.
        if m.bias is not None:
            init.constant_(m.bias.data, 0)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.GRU):
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)
    elif isinstance(m, nn.GRUCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
def weight_init(m):
    """
    Initialise the parameters of a single layer in place, chosen by its type.

    Usage:
        model = Model()
        model.apply(weight_init)

    * Conv1d / ConvTranspose1d: weight from ``init.normal_``.
    * Conv2d/3d, ConvTranspose2d/3d and Linear: Xavier-normal weight.
    * BatchNorm1d/2d/3d: weight from N(1, 0.02), bias set to 0.
    * LSTM / LSTMCell / GRU / GRUCell: orthogonal for parameters with two or
      more dimensions, ``init.normal_`` for the rest.
    * Embedding: uniform in ``[-0.5 / dim, 0.5 / dim]`` where ``dim`` is the
      size of the last weight dimension (skipped when that size is 0).

    For the convolution and linear layers the bias, when the layer has one,
    is drawn with ``init.normal_``. Any other module is left untouched.
    """
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.Conv2d, nn.Conv3d)):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.ConvTranspose2d, nn.ConvTranspose3d)):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
        return
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
        return
    elif isinstance(m, nn.Embedding):
        embed_size = m.weight.size(-1)
        if embed_size > 0:
            init_range = 0.5 / embed_size
            init.uniform_(m.weight.data, -init_range, init_range)
        return
    else:
        return

    # Shared tail for conv / linear layers. The bias is None when the layer
    # was built with bias=False, so it must be checked before dereferencing
    # (the Linear branch previously crashed on such layers).
    if m.bias is not None:
        init.normal_(m.bias.data)
def __init__(self, input_size, output_size, use_noisy_net=False):
    """Build the convolutional trunk, the actor head and the two critic heads.

    Args:
        input_size: accepted but not read by this constructor.
        output_size: width of the actor head's final layer.
        use_noisy_net: when true, ``NoisyLinear`` is used in place of
            ``nn.Linear`` for every fully connected layer.
    """
    super(CnnActorCriticNetwork, self).__init__()

    if use_noisy_net:
        print('use NoisyNet')
    linear = NoisyLinear if use_noisy_net else nn.Linear

    # Layers are instantiated in network order so that parameter creation
    # happens in the same sequence as the layers appear.
    trunk_layers = [
        nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),
        linear(2304, 256),
        nn.ReLU(),
        linear(256, 448),
        nn.ReLU(),
    ]
    self.feature = nn.Sequential(*trunk_layers)

    self.actor = nn.Sequential(linear(448, 448), nn.ReLU(), linear(448, output_size))
    self.extra_layer = nn.Sequential(linear(448, 448), nn.ReLU())
    self.critic_ext = linear(448, 1)
    self.critic_int = linear(448, 1)

    def _orthogonal_with_zero_bias(layer, gain):
        # Orthogonal weight scaled by ``gain`` plus an all-zero bias.
        init.orthogonal_(layer.weight, gain)
        layer.bias.data.zero_()

    # First pass: every Conv2d / nn.Linear registered so far gets gain sqrt(2).
    for module in self.modules():
        if isinstance(module, nn.Conv2d):
            _orthogonal_with_zero_bias(module, np.sqrt(2))
        if isinstance(module, nn.Linear):
            _orthogonal_with_zero_bias(module, np.sqrt(2))

    # Second pass: the heads are re-initialised with smaller gains.
    _orthogonal_with_zero_bias(self.critic_ext, 0.01)
    _orthogonal_with_zero_bias(self.critic_int, 0.01)

    # Exact type match on purpose: subclasses of nn.Linear are skipped here.
    for layer in self.actor:
        if type(layer) is nn.Linear:
            _orthogonal_with_zero_bias(layer, 0.01)

    for layer in self.extra_layer:
        if type(layer) is nn.Linear:
            _orthogonal_with_zero_bias(layer, 0.1)
def weight_init(m):
    """
    Function to initialize the weight of a layer, chosen by the layer's type.

    Usage:
        network = Model()
        network.apply(weight_init)

    * Conv1d / ConvTranspose1d: weight from ``init.normal_``.
    * Conv2d/3d, ConvTranspose2d/3d and Linear: Xavier-normal weight.
    * BatchNorm1d/2d/3d: weight from N(1, 0.02), bias set to 0.
    * LSTM / LSTMCell / GRU / GRUCell: orthogonal for parameters with two or
      more dimensions, ``init.normal_`` for the rest.

    For the convolution and linear layers the bias, when the layer has one,
    is drawn with ``init.normal_``. Any other module is left untouched.
    """
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.Conv2d, nn.Conv3d)):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
    elif isinstance(m, (nn.ConvTranspose2d, nn.ConvTranspose3d)):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
        return
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
        return
    else:
        return

    # Shared tail for conv / linear layers. The bias is None when the layer
    # was built with bias=False, so it must be checked before dereferencing
    # (the Linear branch previously crashed on such layers).
    if m.bias is not None:
        init.normal_(m.bias.data)
def __init__(self, hps, *_):
    """Build the embeddings, dropout layers, BiLSTMs and biaffine parameters.

    Args:
        hps: mapping of hyper-parameters. The keys read here are
            'batch_size', 'sent_hdim', 'sent_edim', 'pos_edim', 'role_edim',
            'vword', 'vbio', 'vpos', 'vdep', 'vframe', 'rec_layers', 'svdep'
            and 'word_embeddings' (the latter is passed to
            ``torch.from_numpy``).
        *_: any further positional arguments are accepted and ignored.
    """
    super(BiLSTMTagger, self).__init__()
    batch_size = hps['batch_size']
    lstm_hidden_dim = hps['sent_hdim']
    sent_embedding_dim_DEP = 2 * hps['sent_edim']
    sent_embedding_dim_SRL = 2 * hps['sent_edim'] + 16 + 16
    ## for the region mark
    role_embedding_dim = hps['role_edim']
    frame_embedding_dim = role_embedding_dim
    vocab_size = hps['vword']
    self.tagset_size = hps['vbio']
    self.pos_size = hps['vpos']
    self.dep_size = hps['vdep']
    self.frameset_size = hps['vframe']
    # This value is overwritten with literals (1, 1, 3) further down.
    self.num_layers = hps['rec_layers']
    self.batch_size = batch_size
    self.hidden_dim = lstm_hidden_dim
    self.word_emb_dim = hps['sent_edim']
    self.specific_dep_size = hps['svdep']
    self.word_embeddings_SRL = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_embeddings_DEP = nn.Embedding(vocab_size, hps['sent_edim'])
    self.pos_embeddings = nn.Embedding(self.pos_size, hps['pos_edim'])
    self.pos_embeddings_DEP = nn.Embedding(self.pos_size, hps['pos_edim'])
    self.p_lemma_embeddings = nn.Embedding(self.frameset_size, hps['sent_edim'])
    # Embedding width here is self.pos_size (a vocabulary size), as written.
    self.dep_embeddings = nn.Embedding(self.dep_size, self.pos_size)
    self.region_embeddings = nn.Embedding(2, 16)
    #self.lr_dep_embeddings = nn.Embedding(self.lr_dep_size, hps[])
    # Two embedding tables whose weights are both copied from
    # hps['word_embeddings'].
    self.word_fixed_embeddings = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_fixed_embeddings.weight.data.copy_(
        torch.from_numpy(hps['word_embeddings']))
    self.word_fixed_embeddings_DEP = nn.Embedding(vocab_size, hps['sent_edim'])
    self.word_fixed_embeddings_DEP.weight.data.copy_(
        torch.from_numpy(hps['word_embeddings']))
    self.role_embeddings = nn.Embedding(self.tagset_size, role_embedding_dim)
    self.frame_embeddings = nn.Embedding(self.frameset_size, frame_embedding_dim)
    self.elmo_emb_size = 200
    self.elmo_mlp_word = nn.Sequential(nn.Linear(1024, self.elmo_emb_size), nn.ReLU())
    self.elmo_word = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma_word = nn.Parameter(torch.ones(1))
    self.elmo_mlp = nn.Sequential(
        nn.Linear(2 * lstm_hidden_dim, self.elmo_emb_size), nn.ReLU())
    self.elmo_w = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma = nn.Parameter(torch.ones(1))
    self.SRL_input_dropout = nn.Dropout(p=0.3)
    self.DEP_input_dropout = nn.Dropout(p=0.3)
    self.DEP_link_H_dropout = nn.Dropout(p=0.5)
    self.DEP_link_M_dropout = nn.Dropout(p=0.5)
    self.DEP_tag_H_dropout = nn.Dropout(p=0.5)
    self.DEP_tag_M_dropout = nn.Dropout(p=0.5)
    # The next two assignments repeat the ones made a few lines above.
    self.SRL_input_dropout = nn.Dropout(p=0.3)
    self.DEP_input_dropout = nn.Dropout(p=0.3)
    self.hidden_state_dropout = nn.Dropout(p=0.3)
    self.dropout_1 = nn.Dropout(p=0.3)
    self.dropout_2 = nn.Dropout(p=0.3)
    self.hidden_state_dropout_SRL = nn.Dropout(p=0.3)
    self.dropout_1_SRL = nn.Dropout(p=0.3)
    self.dropout_2_SRL = nn.Dropout(p=0.3)
    #self.use_dropout = nn.Dropout(p=0.2)
    # The LSTM takes word embeddings as inputs, and outputs hidden states
    # with dimensionality hidden_dim.
    self.num_layers = 1
    self.BiLSTM_0 = nn.LSTM(input_size=sent_embedding_dim_DEP,
                            hidden_size=lstm_hidden_dim,
                            batch_first=True,
                            bidirectional=True,
                            num_layers=self.num_layers)
    init.orthogonal_(self.BiLSTM_0.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_0.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_0.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_0.all_weights[1][1])
    self.num_layers = 1
    self.BiLSTM_1 = nn.LSTM(input_size=lstm_hidden_dim * 2,
                            hidden_size=lstm_hidden_dim,
                            batch_first=True,
                            bidirectional=True,
                            num_layers=self.num_layers)
    init.orthogonal_(self.BiLSTM_1.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_1.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_1.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_1.all_weights[1][1])
    self.num_layers = 3
    self.BiLSTM_SRL = nn.LSTM(input_size=sent_embedding_dim_SRL,
                              hidden_size=lstm_hidden_dim,
                              batch_first=True,
                              bidirectional=True,
                              num_layers=self.num_layers)
    # NOTE(review): only all_weights[0] and all_weights[1] are touched; with
    # num_layers=3 these look like the two directions of the first layer only,
    # so deeper layers would keep the default init - confirm this is intended.
    init.orthogonal_(self.BiLSTM_SRL.all_weights[0][0])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[0][1])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[1][0])
    init.orthogonal_(self.BiLSTM_SRL.all_weights[1][1])
    self.ldims = lstm_hidden_dim
    self.hidLayerFOH_link = nn.Linear(self.ldims * 2, self.ldims)
    self.hidLayerFOM_link = nn.Linear(self.ldims * 2, self.ldims)
    self.W_R_link = nn.Parameter(
        torch.rand(lstm_hidden_dim + 1, 1 + lstm_hidden_dim))
    self.hidLayerFOH_tag = nn.Linear(self.ldims * 2, self.ldims)
    self.hidLayerFOM_tag = nn.Linear(self.ldims * 2, self.ldims)
    self.W_R_tag = nn.Parameter(
        torch.rand(lstm_hidden_dim + 1, self.dep_size * (1 + lstm_hidden_dim)))
    self.Non_Predicate_Proj = nn.Linear(2 * lstm_hidden_dim, lstm_hidden_dim)
    self.Predicate_Proj = nn.Linear(2 * lstm_hidden_dim, lstm_hidden_dim)
    self.W_R = nn.Parameter(
        torch.rand(lstm_hidden_dim + 1, self.tagset_size * (lstm_hidden_dim + 1)))
    # Both parameters below start as an all-ones (1, word_emb_dim) float32
    # tensor, despite the "_random" suffix on the second one.
    self.VR_word_embedding = nn.Parameter(
        torch.from_numpy(np.ones((1, self.word_emb_dim), dtype='float32')))
    self.VR_word_embedding_random = nn.Parameter(
        torch.from_numpy(np.ones((1, self.word_emb_dim), dtype='float32')))
    # Init hidden state
    # init_hidden_spe() / init_hidden_share() are defined elsewhere on the
    # class (not visible here).
    self.hidden = self.init_hidden_spe()
    self.hidden_2 = self.init_hidden_spe()
    self.hidden_3 = self.init_hidden_spe()
    self.hidden_4 = self.init_hidden_share()
def init_hyper(m):
    """Orthogonally initialise an ``nn.Linear`` weight and zero its bias.

    Written to be passed to ``Module.apply``: any module whose type is not
    exactly ``nn.Linear`` is left untouched.

    Args:
        m: Candidate module.
    """
    # The exact type comparison is kept on purpose so that subclasses of
    # nn.Linear keep being skipped, as they were before.
    if type(m) is not nn.Linear:
        return
    orthogonal_(m.weight.data, gain=1.0)
    # nn.Linear(..., bias=False) stores ``bias`` as None; dereferencing it
    # unconditionally raised AttributeError.
    if m.bias is not None:
        m.bias.data.fill_(0.0)
def __init__(
    self,
    input_dim,
    hidden_dim,
    kernel_size,
    padding_mode='zeros',
    batchnorm=True,
    use_attention=True,
    timesteps=64
):
    """Create the gates, recurrent kernels and normalisation layers.

    Referenced from https://github.com/happyjin/ConvGRU-pytorch

    Args:
        input_dim: Channel count that is added to ``hidden_dim`` to form the
            input channels of the ``a_wu_gate`` and ``i_w_gate`` convolutions.
        hidden_dim: Channel count of every gate output and recurrent kernel.
        kernel_size: Spatial size of the ``w_exc`` / ``w_inh`` kernels.
        padding_mode: Accepted but not used in this constructor.
        batchnorm: Stored on ``self.batchnorm``.
        use_attention: When true, ``a_wu_gate`` is created as well.
        timesteps: Upper end (minus one) of the range from which the
            ``i_w_gate`` bias is drawn before taking its logarithm.
    """
    # Timesteps is funky here... but go ahead and try this until you figure
    # out the exact training length
    super(ConvGRUCell, self).__init__()
    self.padding = kernel_size // 2
    hidden_size = hidden_dim
    self.batchnorm = batchnorm
    self.timesteps = timesteps
    self.use_attention = use_attention

    if self.use_attention:
        self.a_wu_gate = nn.Conv2d(hidden_size + input_dim, hidden_size, 1, padding=1 // 2)
        init.orthogonal_(self.a_wu_gate.weight)
        init.constant_(self.a_wu_gate.bias, 1.)

    # NOTE(review): these gates are placed outside the attention branch
    # because their weights are initialised unconditionally further down;
    # confirm against the original layout.
    self.i_w_gate = nn.Conv2d(hidden_size + input_dim, hidden_size, 1)
    self.e_w_gate = nn.Conv2d(hidden_size * 2, hidden_size, 1)
    self.inh_init = nn.Conv2d(1, hidden_size, 1, padding=1 // 2)
    self.exc_init = nn.Conv2d(1, hidden_size, 1, padding=1 // 2)

    spatial_h_size = kernel_size
    self.h_padding = spatial_h_size // 2
    self.w_exc = nn.Parameter(
        torch.empty(hidden_size, hidden_size, spatial_h_size, spatial_h_size))
    self.w_inh = nn.Parameter(
        torch.empty(hidden_size, hidden_size, spatial_h_size, spatial_h_size))

    # Per-channel scalars, shaped (hidden_size, 1, 1).
    self.alpha = nn.Parameter(torch.empty((hidden_size, 1, 1)))
    self.mu = nn.Parameter(torch.empty((hidden_size, 1, 1)))
    self.gamma = nn.Parameter(torch.empty((hidden_size, 1, 1)))
    self.kappa = nn.Parameter(torch.empty((hidden_size, 1, 1)))

    self.bn = nn.ModuleList([
        nn.GroupNorm(1, hidden_size, eps=1e-03, affine=True) for i in range(2)
    ])

    init.orthogonal_(self.w_inh)
    init.orthogonal_(self.w_exc)
    init.orthogonal_(self.i_w_gate.weight)
    init.orthogonal_(self.e_w_gate.weight)

    for bn in self.bn:
        init.constant_(bn.weight, 0.1)

    init.uniform_(self.alpha, a=0., b=0.1)
    init.uniform_(self.mu, a=0., b=0.1)
    init.uniform_(self.gamma, a=0., b=0.1)
    init.uniform_(self.kappa, a=0., b=0.1)

    # Init gate biases: draw from U(1, timesteps - 1), take the log in place,
    # and give the second gate the negated values.
    init.uniform_(self.i_w_gate.bias.data, 1, self.timesteps - 1)
    # Tensor.log() returns a new tensor; the previous call discarded it and
    # left the bias un-logged. log_() applies it in place.
    self.i_w_gate.bias.data.log_()
    self.e_w_gate.bias.data = -self.i_w_gate.bias.data
def __init__(self, hps, *_):
    """Build the embeddings, BiLSTM stacks, scorers and initial hidden states.

    Args:
        hps: Mapping of hyper-parameters. Keys read here: ``batch_size``,
            ``sent_hdim``, ``sent_edim``, ``pos_edim``, ``role_edim``,
            ``vword``, ``vbio``, ``vpos``, ``vdep``, ``vchar``, ``vframe``,
            ``rec_layers``, ``svdep`` and ``word_embeddings`` (a NumPy array
            copied into the fixed word-embedding tables).
        *_: Extra positional arguments are accepted and ignored.
    """
    super(BiLSTMTagger, self).__init__()

    def orthogonalise_leading_weights(lstm):
        # Visits all_weights[0][0], [0][1], [1][0], [1][1] in that order.
        # In torch.nn.LSTM these are the layer-0 forward/reverse
        # input-hidden and hidden-hidden matrices.
        # NOTE(review): for multi-layer LSTMs the deeper layers keep the
        # default initialisation — confirm this is intended.
        for group in (0, 1):
            for slot in (0, 1):
                init.orthogonal_(lstm.all_weights[group][slot])

    n_batch = hps['batch_size']
    hdim = hps['sent_hdim']
    word_dim = hps['sent_edim']
    # "+ 16" presumably accounts for the 16-dim region-mark embedding
    # (region_embeddings below) — TODO confirm.
    dep_in_dim = 2 * word_dim + 16
    srl_in_dim = 2 * word_dim + 16
    role_dim = hps['role_edim']
    frame_dim = role_dim
    n_words = hps['vword']

    self.tagset_size = hps['vbio']
    self.pos_size = hps['vpos']
    self.dep_size = hps['vdep']
    self.char_size = hps['vchar']
    self.frameset_size = hps['vframe']
    self.num_layers = hps['rec_layers']
    self.batch_size = n_batch
    self.hidden_dim = hdim
    self.word_emb_dim = word_dim
    self.specific_dep_size = hps['svdep']

    # Trainable lookup tables.
    self.char_embeddings = nn.Embedding(self.char_size, 50)
    self.word_embeddings_SRL = nn.Embedding(n_words, word_dim)
    self.word_embeddings_DEP = nn.Embedding(n_words, word_dim)
    pos_dim = hps['pos_edim']
    self.pos_embeddings = nn.Embedding(self.pos_size, pos_dim)
    self.pos_embeddings_DEP = nn.Embedding(self.pos_size, pos_dim)
    self.p_lemma_embeddings = nn.Embedding(self.frameset_size, word_dim)
    self.dep_embeddings = nn.Embedding(self.dep_size, self.pos_size)
    self.region_embeddings = nn.Embedding(2, 16)
    #self.lr_dep_embeddings = nn.Embedding(self.lr_dep_size, hps[])

    # Tables overwritten with the supplied word vectors.
    self.word_fixed_embeddings = nn.Embedding(n_words, word_dim)
    pretrained = torch.from_numpy(hps['word_embeddings'])
    self.word_fixed_embeddings.weight.data.copy_(pretrained)
    self.word_fixed_embeddings_DEP = nn.Embedding(n_words, word_dim)
    self.word_fixed_embeddings_DEP.weight.data.copy_(pretrained)
    # NOTE(review): the same table is rebuilt and refilled a second time.
    # It is redundant, but kept so the random-number stream seen by the
    # layers created below stays unchanged.
    self.word_fixed_embeddings_DEP = nn.Embedding(n_words, word_dim)
    self.word_fixed_embeddings_DEP.weight.data.copy_(pretrained)

    self.charCNN = layer.CharCNN(
        num_of_conv=3,
        in_channels=1,
        out_channels=50,
        kernel_size=[2, 3, 4],
        in_features=50,
        out_features=100,
    )

    self.role_embeddings = nn.Embedding(self.tagset_size, role_dim)
    self.frame_embeddings = nn.Embedding(self.frameset_size, frame_dim)

    bi_dim = 2 * hdim  # width of a bidirectional LSTM output
    self.hidden2tag = nn.Linear(4 * hdim, bi_dim)
    self.MLP = nn.Linear(bi_dim, self.dep_size)
    self.tag2hidden = nn.Linear(self.dep_size, self.pos_size)

    self.hidden2tag_spe = nn.Linear(bi_dim, bi_dim)
    self.MLP_spe = nn.Linear(bi_dim, 4)
    self.tag2hidden_spe = nn.Linear(4, self.pos_size)

    #self.elmo_embeddings_0 = nn.Embedding(vocab_size, 1024)
    #self.elmo_embeddings_0.weight.data.copy_(torch.from_numpy(hps['elmo_embeddings_0']))
    #self.elmo_embeddings_1 = nn.Embedding(vocab_size, 1024)
    #self.elmo_embeddings_1.weight.data.copy_(torch.from_numpy(hps['elmo_embeddings_1']))

    self.elmo_emb_size = 200
    self.elmo_mlp_word = nn.Sequential(
        nn.Linear(1024, self.elmo_emb_size), nn.ReLU())
    self.elmo_word = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma_word = nn.Parameter(torch.ones(1))

    self.elmo_mlp = nn.Sequential(
        nn.Linear(bi_dim, self.elmo_emb_size), nn.ReLU())
    self.elmo_w = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma = nn.Parameter(torch.ones(1))

    # NOTE(review): the first three dropouts are assigned twice with the
    # same probability; the repetition is retained as-is.
    self.SRL_input_dropout = nn.Dropout(p=0.3)
    self.DEP_input_dropout = nn.Dropout(p=0.3)
    self.hidden_state_dropout = nn.Dropout(p=0.3)
    self.SRL_input_dropout = nn.Dropout(p=0.3)
    self.DEP_input_dropout = nn.Dropout(p=0.3)
    self.hidden_state_dropout = nn.Dropout(p=0.3)
    self.dropout_1 = nn.Dropout(p=0.3)
    self.dropout_2 = nn.Dropout(p=0.3)
    self.hidden_state_dropout_SRL = nn.Dropout(p=0.3)
    self.dropout_1_SRL = nn.Dropout(p=0.3)
    self.dropout_2_SRL = nn.Dropout(p=0.3)
    #self.use_dropout = nn.Dropout(p=0.2)

    # The LSTMs take embeddings as inputs and output hidden states with
    # dimensionality hdim per direction. self.num_layers is overwritten
    # before each one, so it ends up as 3.
    lstm_common = dict(hidden_size=hdim, batch_first=True, bidirectional=True)

    self.num_layers = 1
    self.BiLSTM_0 = nn.LSTM(input_size=dep_in_dim,
                            num_layers=self.num_layers,
                            **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_0)

    self.num_layers = 1
    self.BiLSTM_1 = nn.LSTM(input_size=hdim * 2,
                            num_layers=self.num_layers,
                            **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_1)

    self.num_layers = 3
    self.BiLSTM_SRL = nn.LSTM(input_size=srl_in_dim + self.elmo_emb_size * 1,
                              num_layers=self.num_layers,
                              **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_SRL)

    self.Non_Predicate_Proj_DEP = nn.Linear(bi_dim, hdim)
    self.Predicate_Proj_DEP = nn.Linear(bi_dim, hdim)
    self.W_R_DEP = nn.Parameter(
        torch.rand(hdim + 1, self.dep_size * hdim))

    self.Non_Predicate_Proj = nn.Linear(bi_dim, hdim)
    self.Predicate_Proj = nn.Linear(bi_dim, hdim)
    self.W_R = nn.Parameter(
        torch.rand(hdim + 1, self.tagset_size * (hdim + 1)))

    # Init hidden state
    self.hidden = self.init_hidden_spe()
    self.hidden_2 = self.init_hidden_spe()
    self.hidden_3 = self.init_hidden_spe()
    self.hidden_4 = self.init_hidden_share()

    # Init hidden state
    # NOTE(review): hidden, hidden_2 and hidden_3 are initialised a second
    # time here; the helper calls are kept in their original sequence.
    self.hidden = self.init_hidden_spe()
    self.hidden_2 = self.init_hidden_spe()
    self.hidden_3 = self.init_hidden_spe()
    self.hidden_DEP_base = self.init_hidden_spe()
    self.hidden_DEP = self.init_hidden_spe()
    self.hidden_SRL_base = self.init_hidden_spe()
    self.hidden_SRL = self.init_hidden_SRL()
    self.hidden_PI = self.init_hidden_share()
def __init__(self, hps, *_):
    """Build the embeddings, the four BiLSTM encoders and the scoring heads.

    Args:
        hps: Mapping of hyper-parameters. Keys read here: ``batch_size``,
            ``sent_hdim``, ``sent_edim``, ``pos_edim``, ``role_edim``,
            ``vword``, ``vbio``, ``vpos``, ``vdep``, ``vframe``,
            ``rec_layers``, ``svdep`` and ``word_embeddings`` (a NumPy array
            copied into the fixed word-embedding tables).
        *_: Extra positional arguments are accepted and ignored.
    """
    super(BiLSTMTagger, self).__init__()

    def orthogonalise_leading_weights(lstm):
        # Visits all_weights[0][0], [0][1], [1][0], [1][1] in that order.
        # In torch.nn.LSTM these are the layer-0 forward/reverse
        # input-hidden and hidden-hidden matrices.
        # NOTE(review): for the 2-layer BiLSTM_SRL_high the second layer
        # keeps the default initialisation — confirm this is intended.
        for group in (0, 1):
            for slot in (0, 1):
                init.orthogonal_(lstm.all_weights[group][slot])

    n_batch = hps['batch_size']
    hdim = hps['sent_hdim']
    word_dim = hps['sent_edim']
    dep_in_dim = 2 * word_dim
    # The pos_edim term is multiplied by zero, so it adds nothing to the
    # width; "+ 16" presumably matches the 16-dim region_embeddings table
    # below — TODO confirm.
    srl_in_dim = 2 * word_dim + 0 * hps['pos_edim'] + 16
    self.sent_embedding_dim_DEP = dep_in_dim
    role_dim = hps['role_edim']
    frame_dim = role_dim
    n_words = hps['vword']

    self.tagset_size = hps['vbio']
    self.pos_size = hps['vpos']
    self.dep_size = hps['vdep']
    self.frameset_size = hps['vframe']
    self.num_layers = hps['rec_layers']
    self.batch_size = n_batch
    self.hidden_dim = hdim
    self.word_emb_dim = word_dim
    self.specific_dep_size = hps['svdep']

    # Trainable lookup tables.
    self.word_embeddings_SRL = nn.Embedding(n_words, word_dim)
    self.word_embeddings_DEP = nn.Embedding(n_words, word_dim)
    pos_dim = hps['pos_edim']
    self.pos_embeddings = nn.Embedding(self.pos_size, pos_dim)
    self.pos_embeddings_DEP = nn.Embedding(self.pos_size, pos_dim)
    self.p_lemma_embeddings = nn.Embedding(self.frameset_size, word_dim)
    self.dep_embeddings = nn.Embedding(self.dep_size, self.pos_size)
    self.region_embeddings = nn.Embedding(2, 16)
    # self.lr_dep_embeddings = nn.Embedding(self.lr_dep_size, hps[])

    # Tables overwritten with the supplied word vectors.
    self.word_fixed_embeddings = nn.Embedding(n_words, word_dim)
    pretrained = torch.from_numpy(hps['word_embeddings'])
    self.word_fixed_embeddings.weight.data.copy_(pretrained)
    self.word_fixed_embeddings_DEP = nn.Embedding(n_words, word_dim)
    self.word_fixed_embeddings_DEP.weight.data.copy_(pretrained)

    self.role_embeddings = nn.Embedding(self.tagset_size, role_dim)
    self.frame_embeddings = nn.Embedding(self.frameset_size, frame_dim)

    bi_dim = 2 * hdim  # width of a bidirectional LSTM output
    self.hidden2tag = nn.Linear(4 * hdim, bi_dim)
    self.MLP = nn.Linear(bi_dim, self.dep_size)
    self.tag2hidden = nn.Linear(self.dep_size, self.pos_size)

    self.hidden2tag_spe = nn.Linear(bi_dim, bi_dim)
    self.MLP_spe = nn.Linear(bi_dim, 4)
    self.tag2hidden_spe = nn.Linear(4, self.pos_size)

    # self.elmo_embeddings_0 = nn.Embedding(vocab_size, 1024)
    # self.elmo_embeddings_0.weight.data.copy_(torch.from_numpy(hps['elmo_embeddings_0']))
    # self.elmo_embeddings_1 = nn.Embedding(vocab_size, 1024)
    # self.elmo_embeddings_1.weight.data.copy_(torch.from_numpy(hps['elmo_embeddings_1']))

    self.elmo_emb_size = 200
    self.elmo_mlp_word = nn.Sequential(
        nn.Linear(1024, self.elmo_emb_size), nn.ReLU())
    self.elmo_word = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma_word = nn.Parameter(torch.ones(1))

    self.elmo_mlp = nn.Sequential(
        nn.Linear(bi_dim, self.elmo_emb_size), nn.ReLU())
    self.elmo_w = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma = nn.Parameter(torch.ones(1))

    # Dropout layers at p=0.5 (p=0.3 for hidden_state_dropout_DEP).
    self.SRL_input_dropout = nn.Dropout(p=0.5)
    self.DEP_input_dropout = nn.Dropout(p=0.5)
    self.hidden_state_dropout_DEP = nn.Dropout(p=0.3)
    self.hidden_state_dropout_1 = nn.Dropout(p=0.5)
    self.hidden_state_dropout_2 = nn.Dropout(p=0.5)
    self.head_dropout = nn.Dropout(p=0.5)
    self.dep_dropout = nn.Dropout(p=0.5)

    # The "*_unlabeled*" dropout layers all use p=0.2.
    self.DEP_input_dropout_unlabeled = nn.Dropout(p=0.2)
    self.hidden_state_dropout_1_unlabeled = nn.Dropout(p=0.2)
    self.hidden_state_dropout_2_unlabeled = nn.Dropout(p=0.2)
    self.head_dropout_unlabeled = nn.Dropout(p=0.2)
    self.dep_dropout_unlabeled = nn.Dropout(p=0.2)
    self.head_dropout_unlabeled_FF = nn.Dropout(p=0.2)
    self.dep_dropout_unlabeled_FF = nn.Dropout(p=0.2)
    self.head_dropout_unlabeled_BB = nn.Dropout(p=0.2)
    self.dep_dropout_unlabeled_BB = nn.Dropout(p=0.2)
    self.head_dropout_unlabeled_FB = nn.Dropout(p=0.2)
    self.dep_dropout_unlabeled_FB = nn.Dropout(p=0.2)
    self.head_dropout_unlabeled_BF = nn.Dropout(p=0.2)
    self.dep_dropout_unlabeled_BF = nn.Dropout(p=0.2)
    # self.use_dropout = nn.Dropout(p=0.2)

    # The LSTMs take embeddings as inputs and output hidden states with
    # dimensionality hdim per direction.
    lstm_common = dict(hidden_size=hdim, batch_first=True, bidirectional=True)

    self.SA_primary_num_layers = 1
    self.BiLSTM_SA_primary = nn.LSTM(input_size=dep_in_dim,
                                     num_layers=self.SA_primary_num_layers,
                                     **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_SA_primary)

    self.SA_high_num_layers = 1
    self.BiLSTM_SA_high = nn.LSTM(input_size=hdim * 2,
                                  num_layers=self.SA_high_num_layers,
                                  **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_SA_high)

    self.SRL_primary_num_layers = 1
    self.BiLSTM_SRL_primary = nn.LSTM(input_size=srl_in_dim,
                                      num_layers=self.SRL_primary_num_layers,
                                      **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_SRL_primary)

    self.SRL_high_num_layers = 2
    self.BiLSTM_SRL_high = nn.LSTM(input_size=bi_dim,
                                   num_layers=self.SRL_high_num_layers,
                                   **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_SRL_high)

    # non-linear map to role embedding
    self.role_map = nn.Linear(in_features=role_dim * 2,
                              out_features=self.hidden_dim * 4)

    self.map_dim = hdim
    self.ldims = hdim

    # Every W_R_SRL* parameter below shares this shape.
    w_r_shape = (hdim + 1, self.tagset_size * (hdim + 1))

    self.hidLayerFOH_SRL = nn.Linear(self.ldims * 2, self.ldims)
    self.hidLayerFOM_SRL = nn.Linear(self.ldims * 2, self.ldims)
    self.W_R_SRL = nn.Parameter(torch.rand(*w_r_shape))

    self.hidLayerFOH_SRL_FF = nn.Linear(self.ldims, self.ldims)
    self.hidLayerFOM_SRL_FF = nn.Linear(self.ldims, self.ldims)
    self.W_R_SRL_FF = nn.Parameter(torch.rand(*w_r_shape))

    self.hidLayerFOH_SRL_BB = nn.Linear(self.ldims, self.ldims)
    self.hidLayerFOM_SRL_BB = nn.Linear(self.ldims, self.ldims)
    self.W_R_SRL_BB = nn.Parameter(torch.rand(*w_r_shape))

    self.hidLayerFOH_SRL_BF = nn.Linear(self.ldims, self.ldims)
    self.hidLayerFOM_SRL_BF = nn.Linear(self.ldims, self.ldims)
    self.W_R_SRL_BF = nn.Parameter(torch.rand(*w_r_shape))

    self.hidLayerFOH_SRL_FB = nn.Linear(self.ldims, self.ldims)
    self.hidLayerFOM_SRL_FB = nn.Linear(self.ldims, self.ldims)
    self.W_R_SRL_FB = nn.Parameter(torch.rand(*w_r_shape))

    # A learnable all-ones row vector as wide as the DEP encoder input.
    vr_init = np.ones((1, dep_in_dim), dtype='float32')
    self.VR_embedding = nn.Parameter(torch.from_numpy(vr_init))

    self.mid_hidden = hdim
    self.POS_MLP = nn.Sequential(
        nn.Linear(bi_dim, hdim),
        nn.ReLU(),
        nn.Linear(hdim, self.pos_size),
    )

    # Initial hidden states, one per encoder.
    self.SRL_primary_hidden = self.init_SRL_primary()
    self.SRL_high_hidden = self.init_SRL_high()
    self.SA_primary_hidden = self.init_SA_primary()
    self.SA_high_hidden = self.init_SA_high()
def __init__(
    self,
    d_feat=6,
    output_dim=1,
    freq_dim=10,
    hidden_size=64,
    dropout_W=0.0,
    dropout_U=0.0,
    device="cpu",
):
    """Create the trainable parameters and activations of the module.

    ``W_*`` matrices are Xavier-uniform initialised, ``U_*`` matrices are
    orthogonally initialised, and ``b_*`` vectors are constant (ones for
    ``b_ste`` and ``b_fre``, zeros otherwise).

    Args:
        d_feat: Stored as ``input_dim``; row count of the ``W_*`` matrices
            for the ``i``/``ste``/``fre``/``c``/``o`` groups.
        output_dim: Column count of ``W_p`` and input width of ``fc_out``.
        freq_dim: Column count of ``W_fre``/``U_fre``; row count of ``U_a``.
        hidden_size: Stored as ``hidden_dim``.
        dropout_W: Stored on ``self.dropout_W``.
        dropout_U: Stored on ``self.dropout_U``.
        device: Stored on ``self.device``; nothing is moved here.
    """
    super().__init__()

    n_in, n_out, n_freq, n_hid = d_feat, output_dim, freq_dim, hidden_size
    self.input_dim = n_in
    self.output_dim = n_out
    self.freq_dim = n_freq
    self.hidden_dim = n_hid
    self.device = device

    def xavier(rows, cols):
        return nn.Parameter(init.xavier_uniform_(torch.empty(rows, cols)))

    def orthogonal(rows, cols):
        return nn.Parameter(init.orthogonal_(torch.empty(rows, cols)))

    def zero_vector(size):
        return nn.Parameter(torch.zeros(size))

    def one_vector(size):
        return nn.Parameter(torch.ones(size))

    # Creation order is kept unchanged: it fixes both the registration
    # order of the parameters and the sequence of random draws.
    self.W_i = xavier(n_in, n_hid)
    self.U_i = orthogonal(n_hid, n_hid)
    self.b_i = zero_vector(n_hid)

    self.W_ste = xavier(n_in, n_hid)
    self.U_ste = orthogonal(n_hid, n_hid)
    self.b_ste = one_vector(n_hid)

    self.W_fre = xavier(n_in, n_freq)
    self.U_fre = orthogonal(n_hid, n_freq)
    self.b_fre = one_vector(n_freq)

    self.W_c = xavier(n_in, n_hid)
    self.U_c = orthogonal(n_hid, n_hid)
    self.b_c = zero_vector(n_hid)

    self.W_o = xavier(n_in, n_hid)
    self.U_o = orthogonal(n_hid, n_hid)
    self.b_o = zero_vector(n_hid)

    self.U_a = orthogonal(n_freq, 1)
    self.b_a = zero_vector(n_hid)

    self.W_p = xavier(n_hid, n_out)
    self.b_p = zero_vector(n_out)

    self.activation = nn.Tanh()
    self.inner_activation = nn.Hardsigmoid()
    self.dropout_W = dropout_W
    self.dropout_U = dropout_U
    self.fc_out = nn.Linear(self.output_dim, 1)

    self.states = []
def weight_init(m):
    """Initialise the parameters of one module in place.

    Usage:
        model = Model()
        model.apply(weight_init)

    Scheme by module type:
        * Conv1d / ConvTranspose1d: normal weights, normal bias.
        * Conv2d / Conv3d / ConvTranspose2d / ConvTranspose3d:
          Xavier-normal weights, normal bias.
        * BatchNorm1d / 2d / 3d: weights ~ N(1, 0.02), bias 0.
        * Linear: Xavier-normal weights, normal bias.
        * LSTM / LSTMCell / GRU / GRUCell: orthogonal for parameters with two
          or more dimensions, normal for the rest.

    Any other module type is left unchanged. A missing (``None``) bias, and
    for batch norm a missing weight, is skipped.

    Args:
        m: Module to initialise.
    """
    if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.Conv2d, nn.Conv3d,
                        nn.ConvTranspose2d, nn.ConvTranspose3d)):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        # With affine=False both attributes are None; dereferencing them
        # unconditionally raised AttributeError.
        if m.weight is not None:
            init.normal_(m.weight.data, mean=1, std=0.02)
        if m.bias is not None:
            init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        # nn.Linear(..., bias=False) has no bias tensor to initialise.
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
def _initialize_weights(self):
    """Orthogonally initialise the weights of ``conv1`` .. ``conv4``.

    ``conv1``-``conv3`` use the gain recommended for ReLU; ``conv4`` uses the
    default gain of ``init.orthogonal_``.
    """
    relu_gain = init.calculate_gain("relu")
    for attr in ("conv1", "conv2", "conv3"):
        init.orthogonal_(getattr(self, attr).weight, relu_gain)
    init.orthogonal_(self.conv4.weight)
def init_weight(self):
    """Initialise ``hidden_proj`` and the first-layer parameters of ``gru``.

    ``hidden_proj.weight`` gets Xavier-normal values, the GRU's
    ``weight_hh_l0`` and ``weight_ih_l0`` are made orthogonal, and both
    first-layer GRU bias vectors are set to zero.
    """
    init.xavier_normal_(self.hidden_proj.weight)

    gru = self.gru
    # Hidden-hidden matrix first, then input-hidden: the order decides which
    # random draws each matrix receives.
    for matrix in (gru.weight_hh_l0, gru.weight_ih_l0):
        init.orthogonal_(matrix)
    for bias in (gru.bias_ih_l0, gru.bias_hh_l0):
        bias.data.fill_(0.0)
def init_weights(self):
    """Orthogonally initialise every multi-dimensional parameter of ``rnn``.

    Parameters with a single dimension (such as bias vectors) are left as
    they are.
    """
    matrices = (p for p in self.rnn.parameters() if p.dim() > 1)
    for matrix in matrices:
        weight_init.orthogonal_(matrix)
def reset_parameters(self):
    """Re-initialise ``weight`` orthogonally and ``bias`` uniformly.

    The weight is scaled by ``self.gain``. When a bias exists it is drawn
    from ``U(-1/sqrt(fan_in), 1/sqrt(fan_in))``, with ``fan_in`` computed
    from the weight.
    """
    init.orthogonal_(self.weight, self.gain)
    if self.bias is None:
        return
    fan_in = init._calculate_fan_in_and_fan_out(self.weight)[0]
    limit = 1 / math.sqrt(fan_in)
    init.uniform_(self.bias, -limit, limit)
def init_params(self):
    """Initialise ``self.LSTMLayer``: orthogonal matrices, zero biases.

    For every entry of ``LSTMLayer.all_weights`` the first two tensors are
    made orthogonal and the bias vectors that follow, if any, are zeroed.
    In ``torch.nn.LSTM`` each entry holds one layer/direction as
    ``[weight_ih, weight_hh, bias_ih, bias_hh]``.
    """
    for layer_weights in self.LSTMLayer.all_weights:
        input_hidden, hidden_hidden, *rest = layer_weights
        init.orthogonal_(input_hidden)
        init.orthogonal_(hidden_hidden)
        # With bias=False there are no entries at positions 2 and 3, and
        # indexing them raised IndexError. Only 1-D tensors in those two
        # positions are zeroed, so a non-bias tensor is never wiped.
        for extra in rest[:2]:
            if extra.dim() == 1:
                init.zeros_(extra)
def _initialize_weights(self):
    """Give the four convolution weights an orthogonal initialisation.

    The first three layers are scaled with the ReLU gain; ``conv4`` is left
    at the default gain of ``init.orthogonal_``.
    """
    gain = init.calculate_gain('relu')
    init.orthogonal_(self.conv1.weight, gain)
    init.orthogonal_(self.conv2.weight, gain)
    init.orthogonal_(self.conv3.weight, gain)
    init.orthogonal_(self.conv4.weight)
def __init__(self, hps, *_):
    """Build the embeddings, BiLSTM stacks, projection layers and hidden states.

    Args:
        hps: Mapping of hyper-parameters. Keys read here: ``batch_size``,
            ``sent_hdim``, ``sent_edim``, ``pos_edim``, ``role_edim``,
            ``vword``, ``vbio``, ``vpos``, ``vdep``, ``vframe``,
            ``rec_layers``, ``svdep`` and ``word_embeddings`` (a NumPy array
            copied into the fixed word-embedding tables).
        *_: Extra positional arguments are accepted and ignored.
    """
    super(BiLSTMTagger, self).__init__()

    def orthogonalise_leading_weights(lstm):
        # Visits all_weights[0][0], [0][1], [1][0], [1][1] in that order.
        # In torch.nn.LSTM these are the layer-0 forward/reverse
        # input-hidden and hidden-hidden matrices.
        # NOTE(review): for the 2-layer BiLSTM_SRL the second layer keeps
        # the default initialisation — confirm this is intended.
        for group in (0, 1):
            for slot in (0, 1):
                init.orthogonal_(lstm.all_weights[group][slot])

    n_batch = hps['batch_size']
    hdim = hps['sent_hdim']
    word_dim = hps['sent_edim']
    dep_in_dim = 2 * word_dim + 1 * hps['pos_edim']
    # "+ 16" presumably accounts for the 16-dim region-mark embedding
    # (region_embeddings below) — TODO confirm.
    srl_in_dim = 2 * word_dim + 16
    role_dim = hps['role_edim']
    frame_dim = role_dim
    n_words = hps['vword']

    self.tagset_size = hps['vbio']
    self.pos_size = hps['vpos']
    self.dep_size = hps['vdep']
    self.frameset_size = hps['vframe']
    self.num_layers = hps['rec_layers']
    self.batch_size = n_batch
    self.hidden_dim = hdim
    self.word_emb_dim = word_dim
    self.specific_dep_size = hps['svdep']

    # Trainable lookup tables.
    self.word_embeddings_SRL = nn.Embedding(n_words, word_dim)
    self.word_embeddings_DEP = nn.Embedding(n_words, word_dim)
    pos_dim = hps['pos_edim']
    self.pos_embeddings = nn.Embedding(self.pos_size, pos_dim)
    self.pos_embeddings_DEP = nn.Embedding(self.pos_size, pos_dim)
    self.p_lemma_embeddings = nn.Embedding(self.frameset_size, word_dim)
    self.dep_embeddings = nn.Embedding(self.dep_size, self.pos_size)
    self.region_embeddings = nn.Embedding(2, 16)
    #self.lr_dep_embeddings = nn.Embedding(self.lr_dep_size, hps[])

    # Tables overwritten with the supplied word vectors.
    self.word_fixed_embeddings = nn.Embedding(n_words, word_dim)
    pretrained = torch.from_numpy(hps['word_embeddings'])
    self.word_fixed_embeddings.weight.data.copy_(pretrained)
    self.word_fixed_embeddings_DEP = nn.Embedding(n_words, word_dim)
    self.word_fixed_embeddings_DEP.weight.data.copy_(pretrained)

    self.role_embeddings = nn.Embedding(self.tagset_size, role_dim)
    self.frame_embeddings = nn.Embedding(self.frameset_size, frame_dim)

    bi_dim = 2 * hdim  # width of a bidirectional LSTM output
    quad_dim = 4 * hdim
    self.hidden2tag = nn.Linear(quad_dim, bi_dim)
    self.MLP = nn.Linear(bi_dim, self.specific_dep_size)
    self.tag2hidden = nn.Linear(self.specific_dep_size, self.pos_size)

    self.Head_Proj = nn.Linear(quad_dim, hdim)
    self.W_share = nn.Parameter(torch.rand(hdim, self.dep_size * hdim))
    self.Dep_Proj = nn.Linear(quad_dim, hdim)

    self.MLP_identification = nn.Linear(quad_dim, bi_dim)
    # The attribute name is spelled "Idenficiation"; it is kept because
    # code outside this method may refer to it.
    self.Idenficiation = nn.Linear(bi_dim, 3)

    self.Non_Predicate_Proj = nn.Linear(bi_dim, hdim)
    self.Predicate_Proj = nn.Linear(bi_dim, hdim)

    # NOTE(review): 400 is hard-coded here rather than derived from hdim —
    # confirm it matches the features these layers receive.
    self.MLP_classifier_1 = nn.Linear(400, 400)
    self.MLP_classifier_0 = nn.Linear(400, self.tagset_size)

    self.elmo_emb_size = 200
    self.elmo_mlp_word = nn.Sequential(
        nn.Linear(1024, self.elmo_emb_size), nn.ReLU())
    self.elmo_word = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma_word = nn.Parameter(torch.ones(1))

    self.elmo_mlp = nn.Sequential(
        nn.Linear(bi_dim, self.elmo_emb_size), nn.ReLU())
    self.elmo_w = nn.Parameter(torch.Tensor([0.5, 0.5]))
    self.elmo_gamma = nn.Parameter(torch.ones(1))

    # All dropout layers use p=0.3.
    self.SRL_input_dropout = nn.Dropout(p=0.3)
    self.DEP_input_dropout = nn.Dropout(p=0.3)
    self.hidden_state_dropout = nn.Dropout(p=0.3)
    self.dropout_1 = nn.Dropout(p=0.3)
    self.dropout_2 = nn.Dropout(p=0.3)
    self.label_dropout_3 = nn.Dropout(p=0.3)
    self.label_dropout_4 = nn.Dropout(p=0.3)
    self.id_dropout = nn.Dropout(p=0.3)
    #self.use_dropout = nn.Dropout(p=0.2)

    # The LSTMs take embeddings as inputs and output hidden states with
    # dimensionality hdim per direction. self.num_layers is overwritten
    # before each one, so it ends up as 2.
    lstm_common = dict(hidden_size=hdim, batch_first=True, bidirectional=True)

    self.num_layers = 1
    self.BiLSTM_0 = nn.LSTM(input_size=dep_in_dim,
                            num_layers=self.num_layers,
                            **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_0)

    self.num_layers = 1
    self.BiLSTM_1 = nn.LSTM(input_size=hdim * 2,
                            num_layers=self.num_layers,
                            **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_1)

    self.num_layers = 2
    self.BiLSTM_SRL = nn.LSTM(input_size=srl_in_dim,
                              num_layers=self.num_layers,
                              **lstm_common)
    orthogonalise_leading_weights(self.BiLSTM_SRL)

    self.More_1 = nn.Linear(bi_dim, hdim)
    self.W_R = nn.Linear(hdim, self.dep_size)

    # Init hidden state
    self.hidden = self.init_hidden_spe()
    self.hidden_2 = self.init_hidden_spe()
    self.hidden_3 = self.init_hidden_spe()
    self.hidden_4 = self.init_hidden_share()
def weight_init(m):
    """Initialise the parameters of one module in place.

    Adapted from
    https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5
    and meant to be used as ``model.apply(weight_init)``.

    Scheme by module type:
        * Conv1d / ConvTranspose1d: normal weights, normal bias.
        * Conv2d / Conv3d / ConvTranspose2d / ConvTranspose3d:
          Kaiming-normal weights, normal bias.
        * BatchNorm1d / 2d / 3d: weights ~ N(1, 0.02), bias 0.
        * Linear: Kaiming-normal weights, normal bias.
        * LSTM / LSTMCell / GRU / GRUCell: orthogonal for parameters with two
          or more dimensions, normal for the rest.

    Any other module type is left unchanged. A missing (``None``) bias, and
    for batch norm a missing weight, is skipped.

    Args:
        m: Module to initialise.
    """
    if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.Conv2d, nn.Conv3d,
                        nn.ConvTranspose2d, nn.ConvTranspose3d)):
        init.kaiming_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        # With affine=False both attributes are None; dereferencing them
        # unconditionally raised AttributeError.
        if m.weight is not None:
            init.normal_(m.weight.data, mean=1, std=0.02)
        if m.bias is not None:
            init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.kaiming_normal_(m.weight.data)
        # nn.Linear(..., bias=False) has no bias tensor to initialise.
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
def __init__(self):
    """Register ``test_weight``, a 5x5 orthogonally initialised parameter."""
    super(DummyTorchModule, self).__init__()
    # The buffer starts uninitialised; orthogonal_ fills it in place and
    # returns the same tensor.
    weight = init.orthogonal_(torch.Tensor(5, 5))
    self.test_weight = torch.nn.Parameter(weight)