def test():
    # from_json_file is a classmethod; call it on the class, not an instance
    config_ = BertConfig.from_json_file(MODEL_PATH + 'bert_config.json')
    # device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    from transformers import BertTokenizer
    # tokenizer = BertTokenizer(MODEL_PATH + 'vocab.txt')
    # inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    # print(inputs)
    # from_pretrained is also a classmethod; pass the config explicitly instead of
    # constructing BertModel(config_) first
    model_ = BertModel.from_pretrained(MODEL_PATH, config=config_)
    for name, param in model_.named_parameters():
        print(name)
def get_pretrained_model(path, logger, args=None):
    logger.info('load pretrained model in {}'.format(path))
    bert_tokenizer = BertTokenizer.from_pretrained(path)
    if args is None or args.hidden_layers == 12:
        bert_config = BertConfig.from_pretrained(path)
        bert_model = BertModel.from_pretrained(path)
    else:
        logger.info('load {} layers bert'.format(args.hidden_layers))
        bert_config = BertConfig.from_pretrained(path, num_hidden_layers=args.hidden_layers)
        bert_model = BertModel(bert_config)
        model_param_list = [p[0] for p in bert_model.named_parameters()]
        load_dict = torch.load(os.path.join(path, 'pytorch_model.bin'))
        new_load_dict = {}
        for k, v in load_dict.items():
            k = k.replace('bert.', '')
            if k in model_param_list:
                new_load_dict[k] = v
        new_load_dict['embeddings.position_ids'] = torch.tensor([i for i in range(512)]).unsqueeze(dim=0)
        bert_model.load_state_dict(new_load_dict)
    logger.info('load complete')
    return bert_config, bert_tokenizer, bert_model
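# Hedged usage sketch for get_pretrained_model. The local checkpoint directory
# './chinese-bert-wwm' and the logger setup are illustrative assumptions; any
# directory holding vocab.txt, config.json and pytorch_model.bin should behave
# the same way.
import logging
logging.basicConfig(level=logging.INFO)
demo_logger = logging.getLogger('pretrained_demo')
demo_config, demo_tokenizer, demo_model = get_pretrained_model('./chinese-bert-wwm', demo_logger)
demo_batch = demo_tokenizer('a short example sentence', return_tensors='pt')
with torch.no_grad():
    demo_hidden = demo_model(**demo_batch)[0]
print(demo_hidden.shape)  # [1, seq_len, hidden_size]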
class BertLSTM(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.lstm = nn.LSTM(input_size=config.hidden_size,
                            hidden_size=config.hidden_size,
                            num_layers=1,
                            dropout=0,
                            batch_first=True,
                            bidirectional=False)
        self.fc = nn.Linear(config.hidden_size * 3, config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.tanh = nn.Tanh()
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "embed":
            for name, param in self.bert.named_parameters():
                if "embeddings" in name:
                    param.requires_grad = True
        if config.unfreeze == "embed_enc0":
            for name, param in self.bert.named_parameters():
                if "embeddings" in name or "encoder.layer.0" in name:
                    param.requires_grad = True
        if config.unfreeze == "embed_enc0_pooler":
            for name, param in self.bert.named_parameters():
                if "embeddings" in name or "encoder.layer.0" in name or "pooler" in name:
                    param.requires_grad = True
        if config.unfreeze == "enc0":
            for name, param in self.bert.named_parameters():
                if "encoder.layer.0" in name:
                    param.requires_grad = True
        if config.unfreeze == "enc0_pooler":
            for name, param in self.bert.named_parameters():
                if "encoder.layer.0" in name or "pooler" in name:
                    param.requires_grad = True
        if config.unfreeze == "embed_pooler":
            for name, param in self.bert.named_parameters():
                if "embed" in name or "pooler" in name:
                    param.requires_grad = True
        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True
        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(int(n_layer / 16 - 1))  # each encoder layer has 16 parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True
        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(int(n_layer / 16 - 1))  # each encoder layer has 16 parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, num_chunks, 3, max_chunk_len]
        Returns:
            out: [batch_size, output_dim]
        """
        batch_size = doc.shape[0]
        pooled = self.bert(input_ids=doc[0, :, 0],
                           attention_mask=doc[0, :, 1],
                           token_type_ids=doc[0, :, 2])[1].unsqueeze(0)
        for i in range(batch_size - 1):
            # Output of BertModel: (last_hidden_state, pooler_output, hidden_states, attentions)
            # pooler_output is built from the last-layer hidden state of the first ([CLS]) token
            pool_i = self.bert(input_ids=doc[i + 1, :, 0],
                               attention_mask=doc[i + 1, :, 1],
                               token_type_ids=doc[i + 1, :, 2])[1]
            pooled = torch.cat((pooled, pool_i.unsqueeze(0)), dim=0)
        dp = self.dropout(pooled)  # [batch_size, num_chunks, hidden_size]

        # output: [batch_size, num_chunks, n_directions*hidden_size], output features from the last layer for each t
        # h_n: [n_layers*n_directions, batch_size, hidden_size], hidden state for t=seq_len
        # c_n: [n_layers*n_directions, batch_size, hidden_size], cell state for t=seq_len
        output, (h_n, c_n) = self.lstm(dp)

        # Concat pooling
        # h_n = output[:, -1, ].squeeze(1)             # [batch_size, hidden_size]
        h_n = h_n.squeeze(0)                           # [batch_size, hidden_size]
        h_max = torch.max(output, dim=1).values        # [batch_size, hidden_size]
        h_mean = torch.mean(output, dim=1)             # [batch_size, hidden_size]
        out = torch.cat((h_n, h_max, h_mean), dim=1)   # [batch_size, hidden_size*3]
        out = self.fc(out)                             # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)                    # [batch_size, num_labels]
        # out = self.tanh(out)                         # [batch_size, num_labels]
        return out
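# Minimal smoke-test sketch for BertLSTM. The tiny BertConfig and the extra
# attributes (num_labels, unfreeze) below are illustrative assumptions only;
# in training the config comes from the pretrained checkpoint.
_lstm_cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                       num_attention_heads=2, intermediate_size=64, num_labels=2)
_lstm_cfg.unfreeze = "pooler"
_lstm_model = BertLSTM(_lstm_cfg)
_lstm_doc = torch.zeros(4, 3, 3, 16, dtype=torch.long)  # [batch_size, num_chunks, 3, max_chunk_len]
_lstm_doc[:, :, 1] = 1                                   # all-ones attention masks
print(_lstm_model(_lstm_doc).shape)                      # torch.Size([4, 2])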
class BertPoolLSTM(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)
        self.config = config
        self.bert = BertModel(config)
        # self.seq_summary = SequenceSummary(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.lstm = nn.LSTM(input_size=config.hidden_size,
                            hidden_size=config.hidden_size,
                            num_layers=1,
                            dropout=0,
                            batch_first=True,
                            bidirectional=False)
        self.fc = nn.Linear(config.hidden_size, config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True
        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(int(n_layer / 16 - 1))  # each encoder layer has 16 parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True
        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(int(n_layer / 16 - 1))  # each encoder layer has 16 parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, n_chunks, 3, max_chunk_len]
                n_chunks is the number of chunks within the batch (same for each doc after PadDoc)
        Returns:
            out: [batch_size, output_dim]
        """
        batch_size = doc.shape[0]
        hidden_pooled_layers = []
        for k in range(batch_size):
            # Each doc is treated as a temporary 'batch' and each chunk is an element of that 'bert_batch';
            # n_chunks plays the role of 'bert_batch_size', max_chunk_len corresponds to 'seq_len'
            bert_output_k = self.bert(
                input_ids=doc[k, :, 0],  # [n_chunks, max_chunk_len]
                attention_mask=doc[k, :, 1],
                token_type_ids=doc[k, :, 2])
            # pooled_k = bert_output_k[1].unsqueeze(0)
            # hidden_states (requires config.output_hidden_states=True);
            # each element in the tuple: [n_chunks, max_chunk_len, hidden_size]
            hidden_states_k = bert_output_k[2]

            # Average pooling over the last [pool_layers] layers
            hidden_list_k = list(hidden_states_k[self.config.pool_layers:])
            hidden_stack_k = torch.stack(hidden_list_k)  # [n_pooled_layers, n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers_k = torch.mean(hidden_stack_k, dim=0)  # [n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers.append(hidden_pooled_layers_k)
        hidden_pooled_layers = torch.stack(hidden_pooled_layers)  # [batch_size, n_chunks, max_chunk_len, hidden_size]

        # Pooling within each chunk (over the 512 word tokens of an individual chunk)
        if self.config.pool_method == 'mean':
            hidden_pooled = torch.mean(hidden_pooled_layers, dim=2)  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'max':
            hidden_pooled = torch.max(hidden_pooled_layers, dim=2).values  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'mean_max':
            hidden_pooled_mean = torch.mean(hidden_pooled_layers, dim=2)  # [batch_size, n_chunks, hidden_size]
            hidden_pooled_max = torch.max(hidden_pooled_layers, dim=2).values  # [batch_size, n_chunks, hidden_size]
            hidden_pooled = torch.cat((hidden_pooled_mean, hidden_pooled_max), dim=1)  # [batch_size, n_chunks*2, hidden_size]
        elif self.config.pool_method == 'cls':
            hidden_pooled = hidden_pooled_layers[:, :, 0, :]  # [batch_size, n_chunks, hidden_size]
        else:  # pool_method is None
            hidden_pooled = hidden_pooled_layers.view(
                batch_size, -1, self.config.hidden_size)  # [batch_size, n_chunks*max_chunk_len, hidden_size]

        dp = self.dropout(hidden_pooled)  # [batch_size, ?, hidden_size]
        # ? can be n_chunks, n_chunks*2 or n_chunks*max_chunk_len

        # output: [batch_size, ?, n_directions*hidden_size], output features from the last layer for each t
        # h_n: [n_layers*n_directions, batch_size, hidden_size], hidden state for t=seq_len
        # c_n: [n_layers*n_directions, batch_size, hidden_size], cell state for t=seq_len
        output, (h_n, c_n) = self.lstm(dp)
        h_n = h_n.squeeze(0)           # [batch_size, hidden_size]
        out = self.fc(h_n)             # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)    # [batch_size, num_labels]
        return out
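# Hedged smoke-test sketch for BertPoolLSTM. output_hidden_states=True is
# needed because forward() reads bert_output_k[2]; pool_layers and pool_method
# are custom config attributes this class expects, and the values below are
# illustrative assumptions only.
_plstm_cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=4,
                        num_attention_heads=2, intermediate_size=64,
                        num_labels=2, output_hidden_states=True)
_plstm_cfg.unfreeze = "pooler"
_plstm_cfg.pool_layers = -2       # average the last two hidden-state layers
_plstm_cfg.pool_method = "mean"   # pooling over tokens within each chunk
_plstm_model = BertPoolLSTM(_plstm_cfg)
_plstm_doc = torch.zeros(4, 3, 3, 16, dtype=torch.long)
_plstm_doc[:, :, 1] = 1
print(_plstm_model(_plstm_doc).shape)  # torch.Size([4, 2])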
class BertPoolConv(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)
        self.config = config
        self.bert = BertModel(config)
        # self.seq_summary = SequenceSummary(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=config.num_filters,
                      kernel_size=(fsize, config.hidden_size))
            for fsize in config.filter_sizes
        ])
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True
        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(int(n_layer / 16 - 1))  # each encoder layer has 16 parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True
        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(int(n_layer / 16 - 1))  # each encoder layer has 16 parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, n_chunks, 3, max_chunk_len]
                n_chunks is the number of chunks within the batch (same for each doc after PadDoc)
        Returns:
            out: [batch_size, output_dim]
        """
        batch_size = doc.shape[0]
        hidden_pooled_layers = []
        for k in range(batch_size):
            # Each doc is treated as a temporary 'batch' and each chunk is an element of that 'bert_batch';
            # n_chunks plays the role of 'bert_batch_size', max_chunk_len corresponds to 'seq_len'
            bert_output_k = self.bert(
                input_ids=doc[k, :, 0],  # [n_chunks, max_chunk_len]
                attention_mask=doc[k, :, 1],
                token_type_ids=doc[k, :, 2])
            # pooled_k = bert_output_k[1].unsqueeze(0)
            # hidden_states (requires config.output_hidden_states=True);
            # each element in the tuple: [n_chunks, max_chunk_len, hidden_size]
            hidden_states_k = bert_output_k[2]

            # Average pooling over the last [pool_layers] layers
            hidden_list_k = list(hidden_states_k[self.config.pool_layers:])
            hidden_stack_k = torch.stack(hidden_list_k)  # [n_pooled_layers, n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers_k = torch.mean(hidden_stack_k, dim=0)  # [n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers.append(hidden_pooled_layers_k)
        hidden_pooled_layers = torch.stack(hidden_pooled_layers)  # [batch_size, n_chunks, max_chunk_len, hidden_size]

        # Pooling within each chunk (over the 512 word tokens of an individual chunk)
        if self.config.pool_method == 'mean':
            hidden_pooled = torch.mean(hidden_pooled_layers, dim=2)  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'max':
            hidden_pooled = torch.max(hidden_pooled_layers, dim=2).values  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'mean_max':
            hidden_pooled_mean = torch.mean(hidden_pooled_layers, dim=2)  # [batch_size, n_chunks, hidden_size]
            hidden_pooled_max = torch.max(hidden_pooled_layers, dim=2).values  # [batch_size, n_chunks, hidden_size]
            hidden_pooled = torch.cat((hidden_pooled_mean, hidden_pooled_max), dim=1)  # [batch_size, n_chunks*2, hidden_size]
        elif self.config.pool_method == 'cls':
            hidden_pooled = hidden_pooled_layers[:, :, 0, :]  # [batch_size, n_chunks, hidden_size]
        else:  # pool_method is None
            hidden_pooled = hidden_pooled_layers.view(
                batch_size, -1, self.config.hidden_size)  # [batch_size, n_chunks*max_chunk_len, hidden_size]

        hidden_pooled = hidden_pooled.unsqueeze(1)  # [batch_size, 1, ?, hidden_size]
        hidden_conved = [F.relu(conv(hidden_pooled)) for conv in self.convs]
        # hidden_conved[i]: [batch_size, n_filters, (?-fsize+1), 1]
        hidden_conved = [conv.squeeze(3) for conv in hidden_conved]
        # hidden_conved[i]: [batch_size, n_filters, (?-fsize+1)]
        hc_pooled = [F.max_pool1d(conv, conv.shape[2]) for conv in hidden_conved]
        # hc_pooled[i]: [batch_size, n_filters, 1]
        hc_pooled = [pool.squeeze(2) for pool in hc_pooled]
        # hc_pooled[i]: [batch_size, n_filters]
        cat = torch.cat(hc_pooled, dim=1)  # [batch_size, n_filters * len(filter_sizes)]
        dp = self.dropout(cat)
        out = self.fc(dp)            # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)  # [batch_size, num_labels]
        return out
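# Hedged smoke-test sketch for BertPoolConv. num_filters and filter_sizes are
# custom config attributes this class expects, and the number of chunks must be
# at least max(filter_sizes) for the convolutions to be valid; all values below
# are illustrative assumptions only.
_pconv_cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=4,
                        num_attention_heads=2, intermediate_size=64,
                        num_labels=2, output_hidden_states=True)
_pconv_cfg.unfreeze = "pooler"
_pconv_cfg.pool_layers = -2
_pconv_cfg.pool_method = "mean"
_pconv_cfg.num_filters = 8
_pconv_cfg.filter_sizes = [2, 3]
_pconv_model = BertPoolConv(_pconv_cfg)
_pconv_doc = torch.zeros(4, 3, 3, 16, dtype=torch.long)
_pconv_doc[:, :, 1] = 1
print(_pconv_model(_pconv_doc).shape)  # torch.Size([4, 2])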
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 8002
}

if __name__ == "__main__":
    ctx = "cpu"

    # kobert
    kobert_model_file = "./kobert_resources/pytorch_kobert_2439f391a6.params"
    kobert_vocab_file = "./kobert_resources/kobert_news_wiki_ko_cased-ae5711deb3.spiece"

    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(kobert_model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    # bertmodel.eval()

    # for name, param in bertmodel.named_parameters():
    #     print(name, param.shape)
    for name, param in bertmodel.named_parameters():
        if param.requires_grad:
            print(name, param.shape)
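    # Hedged sanity check that the loaded KoBERT weights run end to end; the
    # token ids below are arbitrary placeholders, not output of the KoBERT
    # SentencePiece tokenizer (that would require kobert_vocab_file).
    dummy_ids = torch.randint(low=0, high=bert_config['vocab_size'], size=(1, 8), device=device)
    with torch.no_grad():
        last_hidden, pooled = bertmodel(input_ids=dummy_ids)[:2]
    print(last_hidden.shape, pooled.shape)  # [1, 8, 768], [1, 768]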
class DocumentBert(BertPreTrainedModel):
    def __init__(self, bert_model_config: BertConfig):
        super(DocumentBert, self).__init__(bert_model_config)
        self.bert_patent = BertModel(bert_model_config)
        self.bert_tsd = BertModel(bert_model_config)
        for param in self.bert_patent.parameters():
            param.requires_grad = False
        for param in self.bert_tsd.parameters():
            param.requires_grad = False
        self.bert_batch_size = self.bert_patent.config.bert_batch_size
        self.dropout_patent = torch.nn.Dropout(p=bert_model_config.hidden_dropout_prob)
        self.dropout_tsd = torch.nn.Dropout(p=bert_model_config.hidden_dropout_prob)
        self.lstm_patent = torch.nn.LSTM(bert_model_config.hidden_size, bert_model_config.hidden_size)
        self.lstm_tsd = torch.nn.LSTM(bert_model_config.hidden_size, bert_model_config.hidden_size)
        self.output = torch.nn.Linear(bert_model_config.hidden_size * 2, out_features=1)

    def forward(self, patent_batch: torch.Tensor, tsd_batch: torch.Tensor, device='cuda'):
        # patent
        bert_output_patent = torch.zeros(
            size=(patent_batch.shape[0],
                  min(patent_batch.shape[1], self.bert_batch_size),
                  self.bert_patent.config.hidden_size),
            dtype=torch.float,
            device=device)
        for doc_id in range(patent_batch.shape[0]):
            bert_output_patent[doc_id][:self.bert_batch_size] = self.dropout_patent(
                self.bert_patent(
                    patent_batch[doc_id][:self.bert_batch_size, 0],
                    token_type_ids=patent_batch[doc_id][:self.bert_batch_size, 1],
                    attention_mask=patent_batch[doc_id][:self.bert_batch_size, 2])[1])
        output_patent, (_, _) = self.lstm_patent(bert_output_patent.permute(1, 0, 2))
        last_layer_patent = output_patent[-1]

        # tsd
        bert_output_tsd = torch.zeros(
            size=(tsd_batch.shape[0],
                  min(tsd_batch.shape[1], self.bert_batch_size),
                  self.bert_tsd.config.hidden_size),
            dtype=torch.float,
            device=device)
        for doc_id in range(tsd_batch.shape[0]):
            bert_output_tsd[doc_id][:self.bert_batch_size] = self.dropout_tsd(
                self.bert_tsd(
                    tsd_batch[doc_id][:self.bert_batch_size, 0],
                    token_type_ids=tsd_batch[doc_id][:self.bert_batch_size, 1],
                    attention_mask=tsd_batch[doc_id][:self.bert_batch_size, 2])[1])
        output_tsd, (_, _) = self.lstm_tsd(bert_output_tsd.permute(1, 0, 2))
        last_layer_tsd = output_tsd[-1]

        x = torch.cat([last_layer_patent, last_layer_tsd], dim=1)
        prediction = torch.sigmoid(self.output(x))
        assert prediction.shape[0] == patent_batch.shape[0]
        return prediction

    def freeze_bert_encoder(self):
        for param in self.bert_patent.parameters():
            param.requires_grad = False
        for param in self.bert_tsd.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.bert_patent.parameters():
            param.requires_grad = True
        for param in self.bert_tsd.parameters():
            param.requires_grad = True

    def unfreeze_bert_encoder_last_layers(self):
        for name, param in self.bert_patent.named_parameters():
            if "encoder.layer.11" in name or "pooler" in name:
                param.requires_grad = True
        for name, param in self.bert_tsd.named_parameters():
            if "encoder.layer.11" in name or "pooler" in name:
                param.requires_grad = True

    def unfreeze_bert_encoder_pooler_layer(self):
        for name, param in self.bert_patent.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
        for name, param in self.bert_tsd.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
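# Hedged usage sketch for DocumentBert. bert_batch_size is a custom attribute
# the class expects on the config; the tiny sizes, the CPU device, and the
# hand-built batches below are illustrative assumptions only. Note this class
# stores each chunk as (input_ids, token_type_ids, attention_mask).
_doc_cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                      num_attention_heads=2, intermediate_size=64)
_doc_cfg.bert_batch_size = 2
_doc_bert = DocumentBert(_doc_cfg)
_patent = torch.zeros(2, 2, 3, 16, dtype=torch.long)  # [n_docs, n_chunks, 3, seq_len]
_tsd = torch.zeros(2, 2, 3, 16, dtype=torch.long)
_patent[:, :, 2] = 1   # attention masks
_tsd[:, :, 2] = 1
print(_doc_bert(_patent, _tsd, device='cpu').shape)   # torch.Size([2, 1])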
#%%
# class AlbertLinear(AlbertPreTrainedModel):
#     def __init__(self, config: AlbertConfig):
#         super().__init__(config)
#         self.albert = AlbertModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.fc = nn.Linear(config.hidden_size, config.num_labels)
#         self.fc_bn = nn.BatchNorm1d(config.num_labels)
#         # self.fc = nn.Linear(config.hidden_size * config.n_chunks, config.num_labels)
#         self.init_weights()
#         # Default: freeze albert
#         for name, param in self.albert.named_parameters():
#             param.requires_grad = False
#         # Unfreeze layers
#         if config.unfreeze == "pooler":
#             for name, param in self.albert.named_parameters():
#                 if "pooler" in name:
#                     param.requires_grad = True
#     def forward(self, doc):
#         """
#         Input:
#             doc: [batch_size, num_chunks, 3, max_chunk_len]
#         Returns:
#             out: [batch_size, output_dim]
#         """
#         batch_size = doc.shape[0]
#         pooled = self.albert(input_ids=doc[0, :, 0],
#                              attention_mask=doc[0, :, 1],
#                              token_type_ids=doc[0, :, 2])[1].unsqueeze(0)
#         for i in range(batch_size - 1):
#             pool_i = self.albert(input_ids=doc[i + 1, :, 0],
#                                  attention_mask=doc[i + 1, :, 1],
#                                  token_type_ids=doc[i + 1, :, 2])[1]
#             pooled = torch.cat((pooled, pool_i.unsqueeze(0)), dim=0)
#         dp = self.dropout(pooled)  # [batch_size, num_chunks, hidden_size]
#         # concat = dp.view(batch_size, -1)  # [batch_size, num_chunks*hidden_size]
#         if self.albert.config.linear_max == True:
#             dp = torch.max(dp, dim=1).values  # [batch_size, hidden_size]
#         else:
#             dp = torch.mean(dp, dim=1)  # [batch_size, hidden_size]
#         # dp = dp.sum(dim=1)  # [batch_size, hidden_size]
#         out = self.fc(dp)  # [batch_size, num_labels]
#         out = self.fc_bn(out)
#         out = F.softmax(out, dim=1)  # [batch_size, num_labels]
#         return out

#%%
# class AlbertLSTM(AlbertPreTrainedModel):
#     def __init__(self, config: AlbertConfig):
#         super().__init__(config)
#         self.albert = AlbertModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.lstm = nn.LSTM(input_size=config.hidden_size, hidden_size=config.hidden_size,
#                             num_layers=1, dropout=0,
#                             batch_first=True, bidirectional=False)
#         self.fc = nn.Linear(config.hidden_size, config.num_labels)
#         self.fc_bn = nn.BatchNorm1d(config.num_labels)
#         self.tanh = nn.Tanh()
#         self.init_weights()
#         # Default: freeze albert
#         for name, param in self.albert.named_parameters():
#             param.requires_grad = False
#         # Unfreeze layers
#         if config.unfreeze == "embed":
#             for name, param in self.albert.named_parameters():
#                 if "embeddings" in name:
#                     param.requires_grad = True
#         if config.unfreeze == "embed_enc0":
#             for name, param in self.albert.named_parameters():
#                 if "embeddings" in name or "encoder" in name:
#                     param.requires_grad = True
#         if config.unfreeze == "embed_enc0_pooler":
#             for name, param in self.albert.named_parameters():
#                 param.requires_grad = True
#         if config.unfreeze == "enc0":
#             for name, param in self.albert.named_parameters():
#                 if "encoder" in name:
#                     param.requires_grad = True
#         if config.unfreeze == "enc0_pooler":
#             for name, param in self.albert.named_parameters():
#                 if "encoder" in name or "pooler" in name:
#                     param.requires_grad = True
#         if config.unfreeze == "embed_pooler":
#             for name, param in self.albert.named_parameters():
#                 if "embed" in name or "pooler" in name:
#                     param.requires_grad = True
#         if config.unfreeze == "pooler":
#             for name, param in self.albert.named_parameters():
#                 if "pooler" in name:
#                     param.requires_grad = True
#     def forward(self, doc):
#         """
#         Input:
#             doc: [batch_size, num_chunks, 3, max_chunk_len]
#         Returns:
#             out: [batch_size, output_dim]
#         """
#         batch_size = doc.shape[0]
#         pooled = self.albert(input_ids=doc[0, :, 0],
#                              attention_mask=doc[0, :, 1],
#                              token_type_ids=doc[0, :, 2])[1].unsqueeze(0)
#         for i in range(batch_size - 1):
#             # Output of BertModel: (last_hidden_state, pooler_output, hidden_states, attentions)
#             # Last layer hidden-state of the first token of the sequence (classification token)
#             pool_i = self.albert(input_ids=doc[i + 1, :, 0],
#                                  attention_mask=doc[i + 1, :, 1],
#                                  token_type_ids=doc[i + 1, :, 2])[1]
#             pooled = torch.cat((pooled, pool_i.unsqueeze(0)), dim=0)
#         dp = self.dropout(pooled)  # [batch_size, num_chunks, bert_hidden_size]
#         # output: [batch_size, num_chunks, n_directions*hidden_size], output features from the last layer for each t
#         # h_n: [n_layers*n_directions, batch_size, hidden_size], hidden state for t=seq_len
#         # c_n: [n_layers*n_directions, batch_size, hidden_size], cell state for t=seq_len
#         output, (h_n, c_n) = self.lstm(dp)
#         # h_n = output[:, -1, ].squeeze(1)  # [batch_size, hidden_size]
#         h_n = h_n.squeeze(0)  # [batch_size, hidden_size]
#         out = self.fc(h_n)  # [batch_size, num_labels]
#         out = self.fc_bn(out)
#         out = F.softmax(out, dim=1)  # [batch_size, num_labels]
#         # out = self.tanh(out)  # [batch_size, num_labels]
#         return out
    for idx in range(len(attentions)):
        output["bert_layer" + str(idx + 1)] = {
            "hidden_states": hidden_states[idx + 1],
            "attention": attentions[idx]
        }
    output["pred_layer"] = {"pooler_output": pooler_output}
    return output


# loss_model
loss_model = MultiLayerBasedDistillationLoss(
    distill_config=distill_config,
    teacher_output_adaptor=output_adaptor,
    student_output_adaptor=output_adaptor)

# optimizer
param_optimizer = list(student_model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
optimizer = torch.optim.Adam(params=optimizer_grouped_parameters, lr=learning_rate)

# evaluator
class BertBinaryClassification(BertPreTrainedModel):
    def __init__(self, config):
        """
        :param config: a transformers Config object
        """
        self.num_labels = 1
        super().__init__(config, num_labels=self.num_labels)

        # The BertModel without pooling layer
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Input dimension depending on the number of token encodings
        input_dim = config.hidden_size * 3
        cls_layers = []
        cls_layers.append(nn.Linear(input_dim, config.hidden_size))
        cls_layers.append(nn.GELU())
        cls_layers.append(nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps))
        cls_layers.append(nn.Linear(config.hidden_size, self.num_labels))

        # The classifier:
        self.classifier = nn.Sequential(*cls_layers)

        # Used in the BCEWithLogitsLoss function
        # to counteract unbalanced training sets
        # Can be changed with self.set_class_weights
        self.class_weights = torch.ones([1])

        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds,
                            output_attentions=output_attentions,
                            output_hidden_states=output_hidden_states,
                            return_dict=return_dict)

        # Use either pooled output or sequence output, depending on settings
        bert_output = outputs[0]

        # Get the state of the [CLS] token,
        # and the first token of the mention and candidate entity

        # Position of the first token of the candidate
        # (right after the [SEP] token)
        cand_pos = torch.argmax(token_type_ids, dim=1)
        # Get the embedding of the first token of the candidate over the batch
        cand_tensors = torch.cat([t[i] for t, i in zip(bert_output, cand_pos)
                                  ]).reshape((bert_output.size(0), bert_output.size(-1)))

        # Flattened input of 3 * hidden_size features
        bert_output = torch.cat([bert_output[:, 0], bert_output[:, 1], cand_tensors], dim=1)
        bert_output = self.dropout(bert_output)
        logits = self.classifier(bert_output)

        loss = None
        if labels is not None:
            # Binary cross entropy loss with class weights and sigmoid
            loss_fn = nn.BCEWithLogitsLoss(pos_weight=self.class_weights)
            loss = loss_fn(logits.view(-1), labels.view(-1).to(dtype=torch.float))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def freeze_layers(self, param_idx: List):
        """
        Freeze layers at provided indices to not train them
        :param param_idx: list of indices of layers to be frozen
        """
        module_params = list(self.named_parameters())
        for param in (module_params[p] for p in param_idx):
            param[1].requires_grad = False

    def freeze_n_transformers(self, n: int = 11):
        """
        Freeze the provided number of encoders in the BERT architecture to not train them
        :param n: number of encoders to freeze
        """
        n = min(n, 12)
        n_emb_layers = 5
        n_params_per_layer = 16  # each encoder layer contributes 16 named parameters
        emb_layers = list(range(n_emb_layers))
        encoder_layers = list(range(n_emb_layers, n_emb_layers + n * n_params_per_layer))
        self.freeze_layers(emb_layers + encoder_layers)

    def freeze_bert(self):
        """
        Freezes all layers in BERT from training
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad = False

    def set_class_weights(self, class_weights):
        """
        Set the self.class_weights used to penalize wrong predictions of the minority class
        :param class_weights: a pytorch tensor with weights over the two classes.
            Used by the BCEWithLogitsLoss.
        """
        self.class_weights = class_weights
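# Hedged usage sketch for BertBinaryClassification. The tiny config and the
# hand-built token_type_ids below are illustrative assumptions; in practice the
# inputs come from a tokenizer over a (mention, candidate) sentence pair.
_bin_cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                      num_attention_heads=2, intermediate_size=64)
_bin_model = BertBinaryClassification(_bin_cfg)
_ids = torch.zeros(2, 10, dtype=torch.long)
_mask = torch.ones(2, 10, dtype=torch.long)
_types = torch.zeros(2, 10, dtype=torch.long)
_types[:, 6:] = 1                      # candidate entity assumed to start at position 6
_labels = torch.tensor([1.0, 0.0])
_loss, _logits = _bin_model(input_ids=_ids, attention_mask=_mask,
                            token_type_ids=_types, labels=_labels,
                            return_dict=False)[:2]
print(_loss.item(), _logits.shape)     # scalar loss, torch.Size([2, 1])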
model = BertModel(configuration)

# Accessing the model configuration
configuration = model.config
print(configuration)

# Loading the Hugging Face Bert Uncased Base Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

# Optimizer Grouped Parameters
# Don't apply weight decay to any parameters whose names include these tokens.
# (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# Separate the `weight` parameters from the `bias` parameters.
# - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.1.
# - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
optimizer_grouped_parameters = [
    # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.1
    },
    # Filter for parameters which *do* include those.
    {