def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask,
                                           sequence_labels, token_labels, choice_labels,
                                           encoder_hidden_states, encoder_attention_mask):
    model = BertModel(config)
    model.to(torch_device)
    model.eval()
    sequence_output, pooled_output = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask)
    sequence_output, pooled_output = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        encoder_hidden_states=encoder_hidden_states)
    sequence_output, pooled_output = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    result = {
        "sequence_output": sequence_output,
        "pooled_output": pooled_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
    self.parent.assertListEqual(
        list(result["pooled_output"].size()),
        [self.batch_size, self.hidden_size])
def create_and_check_model_as_decoder(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    encoder_hidden_states,
    encoder_attention_mask,
):
    config.add_cross_attention = True
    model = BertModel(config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
    )
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        encoder_hidden_states=encoder_hidden_states,
    )
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    self.parent.assertEqual(result.last_hidden_state.shape,
                            (self.batch_size, self.seq_length, self.hidden_size))
    self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
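# Hedged side sketch (not part of the test suite above): the decoder test relies on
# config.add_cross_attention (and is_decoder) being set before BertModel is built. A minimal,
# self-contained illustration with a deliberately tiny config; all sizes below are arbitrary.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, is_decoder=True, add_cross_attention=True)
decoder = BertModel(config)
decoder.eval()
input_ids = torch.randint(0, config.vocab_size, (2, 8))
encoder_states = torch.randn(2, 5, config.hidden_size)  # stand-in for real encoder output
with torch.no_grad():
    out = decoder(input_ids, encoder_hidden_states=encoder_states)
print(out[0].shape)  # (2, 8, hidden_size); out[0] works for both tuple and ModelOutput returns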
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(model_file), strict=False)
    device = torch.device(ctx)
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_json(open(vocab_file, 'rt').read())
    return bertmodel, vocab_b_obj
def __init__(
    self,
    pretrained_model_name=None,
    config_filename=None,
    vocab_size=None,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    max_position_embeddings=512,
):
    super().__init__()

    # Check that only one of pretrained_model_name, config_filename, and
    # vocab_size was passed in
    total = 0
    if pretrained_model_name is not None:
        total += 1
    if config_filename is not None:
        total += 1
    if vocab_size is not None:
        total += 1
    if total != 1:
        raise ValueError(
            "Only one of pretrained_model_name, vocab_size, "
            "or config_filename should be passed into the "
            "BERT constructor.")

    # TK: The following code checks the same once again.
    if vocab_size is not None:
        config = BertConfig(
            vocab_size_or_config_json_file=vocab_size,
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
        )
        model = BertModel(config)
    elif pretrained_model_name is not None:
        model = BertModel.from_pretrained(pretrained_model_name)
    elif config_filename is not None:
        config = BertConfig.from_json_file(config_filename)
        model = BertModel(config)
    else:
        raise ValueError(
            "Either pretrained_model_name or vocab_size must "
            "be passed into the BERT constructor")

    model.to(self._device)

    self.add_module("bert", model)
    self.config = model.config
    self._hidden_size = model.config.hidden_size
def __init__(self,
             *,
             pretrained_model_name=None,
             config_filename=None,
             vocab_size=None,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             max_position_embeddings=512,
             **kwargs):
    TrainableNM.__init__(self, **kwargs)

    # Check that only one of pretrained_model_name, config_filename, and
    # vocab_size was passed in
    total = 0
    if pretrained_model_name is not None:
        total += 1
    if config_filename is not None:
        total += 1
    if vocab_size is not None:
        total += 1
    if total != 1:
        raise ValueError(
            "Only one of pretrained_model_name, vocab_size, "
            "or config_filename should be passed into the "
            "BERT constructor.")

    if vocab_size is not None:
        config = BertConfig(
            vocab_size_or_config_json_file=vocab_size,
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
        )
        model = BertModel(config)
    elif pretrained_model_name is not None:
        model = BertModel.from_pretrained(pretrained_model_name)
    elif config_filename is not None:
        config = BertConfig.from_json_file(config_filename)
        model = BertModel(config)
    else:
        raise ValueError(
            "Either pretrained_model_name or vocab_size must "
            "be passed into the BERT constructor")

    model.to(self._device)

    self.add_module("bert", model)
    self.config = model.config
    for key, value in self.config.to_dict().items():
        self._local_parameters[key] = value
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                         padding_token='[PAD]')
    return bertmodel, vocab_b_obj
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask,
                                sequence_labels, token_labels, choice_labels):
    model = BertModel(config=config)
    model.to(input_ids.device)
    model.eval()

    sequence_output, pooled_output = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)

    # This fails because there is no loss output.
    model_desc = ModelDescription(
        [self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc],
        [self.last_hidden_state_desc, self.pooler_output_desc])
    args_gradient_accumulation_steps = 8
    args_local_rank = 0
    args_world_size = 1
    args_fp16 = True
    args_allreduce_post_accumulation = True

    model = ORTTrainer(
        model,
        None,
        model_desc,
        "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
        device=self.device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=args_gradient_accumulation_steps,
        world_rank=args_local_rank,
        world_size=args_world_size,
        use_mixed_precision=True if args_fp16 else False,
        allreduce_post_accumulation=True if args_allreduce_post_accumulation else False)

    sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
    sequence_output, pooled_output = model(input_ids)
    result = {
        "sequence_output": sequence_output,
        "pooled_output": pooled_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
    self.parent.assertListEqual(
        list(result["pooled_output"].size()),
        [self.batch_size, self.hidden_size])
def __init__(self,
             bert: BertModel,
             tokenizer: BertTokenizer,
             hypernym_list: Union[str, Path, List[List[str]]],
             embed_with_encoder_output: bool = True,
             embed_wo_special_tokens: bool = True,
             use_projection: bool = False,
             batch_size: int = 128):
    super(HyBert, self).__init__()
    self.bert = bert.to(device)
    if not isinstance(hypernym_list, (list, dict)):
        hypernym_list = self._read_hypernym_list(hypernym_list)
    self.tokenizer = tokenizer
    self.hypernym_list = hypernym_list
    self.use_projection = use_projection
    print("Building matrix of hypernym embeddings.")
    self.hypernym_embeddings = torch.nn.Parameter(
        self._build_hypernym_matrix(hypernym_list,
                                    embed_with_encoder_output,
                                    embed_wo_special_tokens,
                                    batch_size))
    if self.use_projection:
        self.projection = nn.Linear(768, 768)
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask,
                                sequence_labels, token_labels, choice_labels):
    model = BertModel(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    result = model(input_ids, token_type_ids=token_type_ids)
    result = model(input_ids)
    self.parent.assertEqual(result.last_hidden_state.shape,
                            (self.batch_size, self.seq_length, self.hidden_size))
    self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
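# Hedged compatibility sketch (an addition, not part of the original tests): the older helpers in
# this file unpack BertModel outputs as a tuple while the newer ones read ModelOutput attributes.
# Integer indexing works for both return types, so a version-agnostic shape check can look like
# this; the tiny config values below are arbitrary.
import torch
from transformers import BertConfig, BertModel

cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                 num_attention_heads=2, intermediate_size=64)
tiny = BertModel(cfg)
tiny.eval()
ids = torch.randint(0, cfg.vocab_size, (2, 7))
with torch.no_grad():
    outputs = tiny(ids)
sequence_output, pooled_output = outputs[0], outputs[1]
assert sequence_output.shape == (2, 7, cfg.hidden_size)
assert pooled_output.shape == (2, cfg.hidden_size)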
def __init__(self, model: BertModel, tokenizer: TokenizerWrapper, device=None):
    self.device = torch.device(
        device if device is not None else "cuda" if torch.cuda.is_available() else "cpu")
    self.model = model.to(self.device)
    self.tokenizer = tokenizer
def _calculate_token_embeddings(self, df: pd.DataFrame, embedder: BertModel):
    all_embeddings = []
    for sent_id, sentence_df in tqdm(df.groupby(SENT_ID),
                                     desc='Creating Bert Embeddings',
                                     unit='sentence'):
        tokens_list = list(sentence_df.groupby(TOKEN_ID).first()[TOKEN_STR])
        sentence = ' '.join(tokens_list)
        input_ids = self.tokenizer.encode(sentence)
        input_ids = torch.tensor(input_ids).unsqueeze(0).to(GPU_ID)  # .to('cpu')
        token_embeddings = self._creat_embeddings(input_ids).to('cpu')
        sent_emb = self._untokenize(tokens_list, token_embeddings)
        # sent_emb = self._untokenize_bpe(tokens_list, token_embeddings)
        all_embeddings.append(sent_emb.data.numpy())
    embedder.to('cpu')
    return all_embeddings
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
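# Hedged side sketch (not from the original loader): torch.nn.Module.load_state_dict reports
# missing and unexpected keys, which is a cheap way to check that a checkpoint such as
# pytorch_model_{bert_type}.bin actually matches the config it is loaded into. The tiny config
# below is purely illustrative.
import torch
from transformers import BertConfig, BertModel

probe_cfg = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                       num_attention_heads=2, intermediate_size=64)
probe = BertModel(probe_cfg)
state = {k: v.clone() for k, v in probe.state_dict().items()}
state.pop("pooler.dense.bias")  # simulate a slightly incompatible checkpoint
result = probe.load_state_dict(state, strict=False)
print("missing:", result.missing_keys)
print("unexpected:", result.unexpected_keys)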
def train(config, bert_config, train_path, dev_path, rel2id, id2rel, tokenizer):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir, exist_ok=True)
    if os.path.exists('./data/train_file.pkl'):
        train_data = pickle.load(open("./data/train_file.pkl", mode='rb'))
    else:
        train_data = data.load_data(train_path, tokenizer, rel2id, num_rels)
        pickle.dump(train_data, open("./data/train_file.pkl", mode='wb'))
    dev_data = json.load(open(dev_path))
    for sent in dev_data:
        data.to_tuple(sent)

    data_manager = data.SPO(train_data)
    train_sampler = RandomSampler(data_manager)
    train_data_loader = DataLoader(data_manager,
                                   sampler=train_sampler,
                                   batch_size=config.batch_size,
                                   drop_last=True)
    num_train_steps = int(len(data_manager) / config.batch_size) * config.max_epoch

    if config.bert_pretrained_model is not None:
        logger.info('load bert weight')
        Bert_model = BertModel.from_pretrained(config.bert_pretrained_model,
                                               config=bert_config)
    else:
        logger.info('random initialize bert model')
        Bert_model = BertModel(config=bert_config)
        Bert_model.init_weights()
    Bert_model.to(device)

    submodel = sub_model(config).to(device)
    objmodel = obj_model(config).to(device)

    loss_func = nn.BCELoss(reduction='none')

    params = list(Bert_model.parameters()) + list(submodel.parameters()) + list(objmodel.parameters())
    optimizer = AdamW(params, lr=config.lr)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(data_manager))
    logger.info(" Num Epochs = %d", config.max_epoch)
    logger.info(" Total train batch size = %d", config.batch_size)
    logger.info(" Total optimization steps = %d", num_train_steps)
    logger.info(" Logging steps = %d", config.print_freq)
    logger.info(" Save steps = %d", config.save_freq)

    global_step = 0
    Bert_model.train()
    submodel.train()
    objmodel.train()

    for epoch in range(config.max_epoch):
        optimizer.zero_grad()
        epoch_iterator = tqdm(train_data_loader, disable=None)
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch)
            input_ids, segment_ids, input_masks, sub_positions, sub_heads, sub_tails, obj_heads, obj_tails = batch
            bert_output = Bert_model(input_ids, input_masks, segment_ids)[0]
            pred_sub_heads, pred_sub_tails = submodel(bert_output)  # [batch_size, seq_len, 1]
            pred_obj_heads, pred_obj_tails = objmodel(bert_output, sub_positions)

            # compute the losses
            mask = input_masks.view(-1)

            # loss1
            sub_heads = sub_heads.unsqueeze(-1)  # [batch_size, seq_len, 1]
            sub_tails = sub_tails.unsqueeze(-1)
            loss1_head = loss_func(pred_sub_heads, sub_heads).view(-1)
            loss1_head = torch.sum(loss1_head * mask) / torch.sum(mask)
            loss1_tail = loss_func(pred_sub_tails, sub_tails).view(-1)
            loss1_tail = torch.sum(loss1_tail * mask) / torch.sum(mask)
            loss1 = loss1_head + loss1_tail

            # loss2
            loss2_head = loss_func(pred_obj_heads, obj_heads).view(-1, obj_heads.shape[-1])
            loss2_head = torch.sum(loss2_head * mask.unsqueeze(-1)) / torch.sum(mask)
            loss2_tail = loss_func(pred_obj_tails, obj_tails).view(-1, obj_tails.shape[-1])
            loss2_tail = torch.sum(loss2_tail * mask.unsqueeze(-1)) / torch.sum(mask)
            loss2 = loss2_head + loss2_tail

            # optimize
            loss = loss1 + loss2
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            global_step += 1
            if (global_step + 1) % config.print_freq == 0:
                logger.info("epoch: {} step: {} #### loss1: {} loss2: {}".format(
                    epoch, global_step + 1, loss1.cpu().item(), loss2.cpu().item()))

            if (global_step + 1) % config.eval_freq == 0:
                logger.info("***** Running evaluating *****")
                with torch.no_grad():
                    Bert_model.eval()
                    submodel.eval()
                    objmodel.eval()
                    P, R, F1 = utils.metric(Bert_model, submodel, objmodel,
                                            dev_data, id2rel, tokenizer)
                    logger.info(f'precision:{P}\nrecall:{R}\nF1:{F1}')
                Bert_model.train()
                submodel.train()
                objmodel.train()

            if (global_step + 1) % config.save_freq == 0:
                # Save a trained model
                model_name = "pytorch_model_%d" % (global_step + 1)
                output_model_file = os.path.join(config.output_dir, model_name)
                state = {
                    'bert_state_dict': Bert_model.state_dict(),
                    'subject_state_dict': submodel.state_dict(),
                    'object_state_dict': objmodel.state_dict(),
                }
                torch.save(state, output_model_file)

    model_name = "pytorch_model_last"
    output_model_file = os.path.join(config.output_dir, model_name)
    state = {
        'bert_state_dict': Bert_model.state_dict(),
        'subject_state_dict': submodel.state_dict(),
        'object_state_dict': objmodel.state_dict(),
    }
    torch.save(state, output_model_file)
class UnStructuredModel:
    def __init__(self, model_name, max_length, stride):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.max_length = max_length
        self.stride = stride
        if model_name == 'bert-base-uncased':
            self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
            self.model = BertModel.from_pretrained(self.model_name)
            self.model.to(device)
            self.model.eval()
            for param in self.model.parameters():
                param.requires_grad = False
            # self.model.bert.embeddings.requires_grad = False

    def padTokens(self, tokens):
        if len(tokens) < self.max_length:
            tokens = tokens + ["[PAD]" for i in range(self.max_length - len(tokens))]
        return tokens

    def getEmbedding(self, text, if_pool=True, pooling_type="mean", batchsize=1):
        tokens = self.tokenizer.tokenize(text)
        tokenized_array = self.tokenizeText(tokens)
        embeddingTensorsList = []
        print(len(tokenized_array))
        tensor = torch.zeros([1, 768], device=device)
        count = 0
        if len(tokenized_array) > batchsize:
            for i in range(0, len(tokenized_array), batchsize):
                current_tokens = tokenized_array[i:min(i + batchsize, len(tokenized_array))]
                token_ids = torch.tensor(current_tokens).to(device)
                seg_ids = [[0 for _ in range(len(tokenized_array[0]))]
                           for _ in range(len(current_tokens))]
                seg_ids = torch.tensor(seg_ids).to(device)
                hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
                cls_head = cls_head.detach()  # detach so the pooled vectors don't keep the graph alive
                if if_pool and pooling_type == "mean":
                    tensor = tensor.add(torch.sum(cls_head, dim=0))
                    count += cls_head.shape[0]
                else:
                    embeddingTensorsList.append(cls_head)
                del cls_head, hidden_reps
            if if_pool and pooling_type == "mean" and count > 0:
                embedding = torch.div(tensor, count)
            elif not if_pool:
                embedding = torch.cat(embeddingTensorsList, dim=0)
            else:
                raise NotImplementedError()
        else:
            token_ids = torch.tensor(tokenized_array).to(device)
            seg_ids = [[0 for _ in range(len(tokenized_array[0]))]
                       for _ in range(len(tokenized_array))]
            seg_ids = torch.tensor(seg_ids).to(device)
            hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
            cls_head.requires_grad = False
            if if_pool and pooling_type == "mean":
                embedding = torch.div(torch.sum(cls_head, dim=0), cls_head.shape[0])
            elif not if_pool:
                embedding = cls_head
            else:
                raise NotImplementedError()
            del cls_head, hidden_reps
        return embedding

    def tokenizeText(self, tokens):
        tokens_array = []
        # window_movement_tokens = max_length - stride
        for i in range(0, len(tokens), self.stride):
            if i + self.max_length < len(tokens):
                curr_tokens = ["[CLS]"] + tokens[i:i + self.max_length] + ["[SEP]"]
            else:
                padded_tokens = self.padTokens(tokens[i:i + self.max_length])
                curr_tokens = ["[CLS]"] + padded_tokens + ["[SEP]"]
            curr_tokens = self.tokenizer.convert_tokens_to_ids(curr_tokens)
            tokens_array.append(curr_tokens)
        return tokens_array
class BertVisdEmbedding(nn.Module):
    '''
    The layer that generates the Bert contextual representation
    '''

    def __init__(self, config=None, device=t.device("cpu")):
        '''
        Args:
            @config: configuration of the internal Bert layer
        '''
        super(BertVisdEmbedding, self).__init__()
        if config is None:
            self.bert = BertModel.from_pretrained('bert-base-uncased')
        else:
            self.bert = BertModel(config=config)  # transformers correspondence
        self.device = device
        self.bert_hidden_size = self.bert.config.hidden_size
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.CLS = tokenizer.convert_tokens_to_ids(['[CLS]'])[0]  # ID of the Bert [CLS] token
        self.SEP = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]  # ID of the Bert [SEP] token
        self.PAD = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]  # ID of the Bert [PAD] token

    def make_bert_input(self, content_idxs, content_type, seg_ids):
        '''
        Args:
            @content_idxs (tensor): Bert IDs of the content. (batch_size, max_seq_len)
                Note that max_seq_len is a fixed number due to the padding/clamping policy.
            @content_type (str): whether the content is "question", "history" or "answer".
            @seg_ids (tensor): the initial segment IDs. For "question" and "answer" this should be
                None; for "history" it should be well-initialized as [0,...,0,1,...,1].
        Return:
            cmp_idx (tensor): [CLS] context_idxs [SEP]. (batch_size, max_seq_len+2)
            segment_ids (tensor): for "question" and "answer" this should be "1,1,...,1"; for
                "history" this should be "seg_ids[0], seg_ids, seg_ids[-1]". (batch_size, max_seq_len+2)
            input_mask (tensor): attention mask over the real tokens in the content. Note that
                [CLS] and [SEP] count as real tokens. (batch_size, q_len + ctx_len + 2)
        '''
        mask = content_idxs != self.PAD  # mask indicating the non-padding tokens in the content
        if content_type == 'question' or content_type == 'answer':  # question/answer type
            seg_ids = t.zeros_like(content_idxs,
                                   dtype=content_idxs.dtype,
                                   device=content_idxs.device)
        seq_len = mask.sum(dim=1)  # (batch_size,) length of each sequence
        batch_size, _ = content_idxs.size()
        content_idxs = t.cat(
            (content_idxs, t.tensor([[self.PAD]] * batch_size, device=content_idxs.device)),
            dim=1)  # (batch_size, max_seq_len+1)
        content_idxs[t.arange(0, batch_size), seq_len] = self.SEP  # append [SEP] to obtain "content_idxs [SEP]"
        seg_last = seg_ids[t.arange(0, batch_size), seq_len - 1]  # last segment id of each sequence
        seg_ids = t.cat(
            (seg_ids, t.tensor([[0]] * batch_size, device=content_idxs.device)),
            dim=1)  # (batch_size, max_seq_len+1)
        seg_ids[t.arange(0, batch_size), seq_len] = seg_last  # segment id of the newly appended [SEP]
        content_idxs = t.cat(
            (t.tensor([[self.CLS]] * batch_size, device=content_idxs.device), content_idxs),
            dim=1)  # (batch_size, max_seq_len+2) prepend [CLS] to obtain "[CLS] content_idxs [SEP]"
        seg_ids = t.cat(
            (seg_ids[:, 0].view(-1, 1), seg_ids),
            dim=1)  # (batch_size, max_seq_len+2) extend the first column of the segment ids
        input_mask = (content_idxs != self.PAD).long()  # (batch_size, max_seq_len+2)
        return content_idxs, seg_ids, input_mask

    def parse_bert_output(self, bert_output, orig_PAD_mask):
        '''
        Args:
            @bert_output (tensor): Bert output with [CLS] and [SEP] embeddings.
                (batch_size, 1+max_seq_len+1, bert_hidden_size)
            @orig_PAD_mask (tensor): 1 for PAD tokens, 0 for non-PAD tokens. (batch_size, max_seq_len)
        Return:
            bert_enc (tensor): Bert output without the [CLS] and [SEP] embeddings and with a
                zero embedding for all PAD tokens. (batch_size, max_seq_len, bert_hidden_size)
        '''
        bert_enc = bert_output[:, 1:-1]  # (batch_size, max_seq_len, bert_hidden_size)
        pad_emb = t.zeros(self.bert_hidden_size,
                          device=bert_output.device)  # manually set the embedding of the PAD token to zero
        # print(bert_enc.size(), orig_PAD_mask.size(), pad_emb.size(), bert_enc.device, orig_PAD_mask.device, pad_emb.device)
        bert_enc = bert_enc.contiguous()
        bert_enc[orig_PAD_mask] = pad_emb  # set the PAD token embeddings to zero
        return bert_enc

    def forward(self, content_idxs, content_type, seg_ids=None):
        '''
        Args:
            @content_idxs (tensor): Bert IDs of the contents. (batch_size, max_seq_len)
                Note that max_seq_len is a fixed number due to the padding/clamping policy.
            @content_type (str): whether the tensor is "question", "history" or "answer"
        Return:
            bert_ctx_emb (tensor): contextual embedding conditioned on the question.
                (batch_size, max_seq_len, bert_hidden_size)
        '''
        orig_PAD_mask = content_idxs == self.PAD
        cmp_idxs, segment_ids, bert_att = self.make_bert_input(content_idxs, content_type, seg_ids)
        # Pass the attention mask and segment ids by keyword so the call does not depend on the
        # positional argument order of BertModel.forward.
        outputs = self.bert(cmp_idxs, attention_mask=bert_att, token_type_ids=segment_ids)
        bert_output = outputs[0]
        bert_enc = self.parse_bert_output(bert_output, orig_PAD_mask)
        return bert_enc

    def train(self, mode=True):
        '''
        Specifically set self.bert into training mode
        '''
        self.training = mode
        self.bert.train(mode)
        return self

    def eval(self):
        '''
        Specifically set self.bert into evaluation mode
        '''
        return self.train(False)

    def to(self, *args, **kwargs):
        '''
        Override the to() interface.
        '''
        print("bert emd to() called!")
        self = super().to(*args, **kwargs)
        self.bert = self.bert.to(*args, **kwargs)
        return self
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 8002
}

if __name__ == "__main__":
    ctx = "cpu"

    # kobert
    kobert_model_file = "./kobert_resources/pytorch_kobert_2439f391a6.params"
    kobert_vocab_file = "./kobert_resources/kobert_news_wiki_ko_cased-ae5711deb3.spiece"

    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(kobert_model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    # bertmodel.eval()

    # for name, param in bertmodel.named_parameters():
    #     print(name, param.shape)

    for name, param in bertmodel.named_parameters():
        if param.requires_grad:
            print(name, param.shape)
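    # Hedged continuation of the script above (not in the original): summarising the parameter
    # count is often more useful than printing every tensor shape.
    n_params = sum(p.numel() for p in bertmodel.parameters())
    n_trainable = sum(p.numel() for p in bertmodel.parameters() if p.requires_grad)
    print(f"total parameters: {n_params:,}  trainable: {n_trainable:,}")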
from transformers import BertConfig, BertModel

if args.size == 'tiny':
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    bert_name_or_path = os.path.join(os.path.join(cur_dir, 'bert'),
                                     'bert-tiny-uncased-config.json')
elif args.size == 'base':
    bert_name_or_path = "bert-base-uncased"
else:
    bert_name_or_path = "bert-large-uncased"

config = BertConfig.from_pretrained(bert_name_or_path)
model = BertModel(config)
model.eval()
device = torch.device("cpu")
model.to(device)

dummy_input0 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input1 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input2 = torch.LongTensor(1, 512).fill_(0).to(device)
dummy_input = (dummy_input0, dummy_input1, dummy_input2)

output_path = './bert/bert_{}.onnx'.format(args.size)
torch.onnx.export(model,
                  dummy_input,
                  output_path,
                  export_params=True,
                  opset_version=12,
                  do_constant_folding=True,
                  input_names=["input_ids", "input_mask", "segment_ids"],
                  output_names=["output"],
                  dynamic_axes={
                      'input_ids': {
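# Hedged follow-up sketch (not part of the export script): once the export has completed, the
# graph can be sanity-checked with onnxruntime. This assumes onnxruntime is installed and that
# the script above was run with --size base, so the file name below is an assumption.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./bert/bert_base.onnx")
feeds = {
    "input_ids": np.ones((1, 512), dtype=np.int64),
    "input_mask": np.ones((1, 512), dtype=np.int64),
    "segment_ids": np.zeros((1, 512), dtype=np.int64),
}
outputs = sess.run(None, feeds)
print([o.shape for o in outputs])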
class Embedder():
    def __init__(self, vectorizer=None, tokenizer=None, dim_embed=200):
        """
        :param tokenizer: KB
        """
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.pre_trained = pre_trained = vectorizer.pre_trained
        self.n_tag = self.vectorizer.n_tag
        if 'bert' in pre_trained.lower():
            self.tag2vec = None
            import sys
            if pre_trained == 'bert-multi':
                from transformers import BertModel, BertConfig
                bert_config = BertConfig.from_pretrained('bert-base-multilingual-cased',
                                                         output_hidden_states=True)
                self.bert = BertModel(bert_config).to(device)
            elif pre_trained == 'sktkobert':
                from kobert.pytorch_kobert import get_pytorch_kobert_model
                # sys.path.append('/home/bwlee/work/codes/sentence_similarity/kobert')
                # from pytorch_kobert3 import get_pytorch_kobert_model
                self.bert, _ = get_pytorch_kobert_model()
                self.bert = self.bert.to(device)
            elif pre_trained == 'kbalbert':
                sys.path.append('/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
                from transformers import AlbertModel
                kbalbert_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
                self.bert = AlbertModel.from_pretrained(kbalbert_path,
                                                        output_hidden_states=True)
                self.bert = self.bert.to(device)
            else:
                from transformers import BertModel, BertConfig
                bert_config = BertConfig.from_pretrained(pre_trained,
                                                         output_hidden_states=True)
                self.bert = BertModel(bert_config).to(device)
        else:
            self.tag2vec = self.vectorizer.tag2vec
            self.n_vocab = len(self.vectorizer.tag2vec)
            if pre_trained == '':
                self.embed = nn.Embedding(num_embeddings=self.n_tag,
                                          embedding_dim=dim_embed,
                                          padding_idx=self.tag2ix[PAD_TAG])

    def set_embed(self, weights=None, bias=None):
        if weights is not None:
            self.embed.weight.data = weights
        if bias is not None:
            self.embed.bias.data = bias

    def __call__(self, text_arr, flag_sent=True):
        """
        check type_ids=None gives different result in bert-multi
        :param text_arr: accepts text in iterable form like batch
        """
        if type(text_arr) is str:
            print('warning: text should be in batch form')
            text_arr = [text_arr]
        if self.pre_trained == '':
            return self._call_manual(text_arr)
        elif self.pre_trained == 'glove':
            return self._call_glove(text_arr)
        elif 'bert' in self.pre_trained:
            return self._call_bert(text_arr, flag_sent)

    def _call_manual(self, text_arr):
        """
        :param text_arr: accepts text in iterable form like batch
        """
        idss = []
        for text in text_arr:
            seq = self.tokenizer.tokenize(text)
            ids = self.vectorizer.get_ids(seq)
            idss.append(ids)
        idss = torch.LongTensor(idss)
        return self.embed(idss)

    def _call_glove(self, text_arr):
        """
        :param text_arr: accepts text in iterable form like batch
        """
        vecs = []
        dim_glove = len(self.vectorizer.tag2vec['the'])
        zero = [0] * dim_glove
        for text in text_arr:
            seq = self.tokenizer.tokenize(text)
            vec = [
                self.vectorizer.tag2vec[token] if token in self.vectorizer.tags else zero
                for token in seq
            ]
            vecs.append(vec)
        return torch.tensor(vecs)

    def _call_bert(self, text_arr, flag_sent):
        idss, masks, type_ids = [], [], []
        for text in text_arr:
            seq = self.tokenizer.tokenize(text)
            ids, mask, type_id = self.vectorizer.get_ids_bert(seq)
            idss.append(ids)
            masks.append(mask)
            type_ids.append(type_id)
        with torch.no_grad():
            idss = torch.tensor(idss).to(device)
            masks = torch.tensor(masks).to(device)
            type_ids = torch.tensor(type_ids).to(device)
            # type_ids = None  # bert-multi gives different values
            clss, last, hiddens = self.bert(idss,
                                            attention_mask=masks,
                                            token_type_ids=type_ids)  # kbalbert
            if flag_sent is True:
                length = torch.sum(masks, dim=1)  # lengths of words in each sentence
                length = torch.sqrt(length * 1.0).unsqueeze(1)
                masks2 = masks.unsqueeze(2)
                context = torch.sum(hiddens[-2] * masks2, dim=1) / length
            else:
                return clss, last, hiddens
        return context
def main(config_path):
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    if not save_dir.exists():
        save_dir.mkdir()
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(str(transfo_dir),
                                              do_lower_case=(not config.data.cased))
    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    # Note: BertModel(bert_config) builds the encoder from the config alone; no pretrained
    # weights are loaded at this point.
    model = BertModel(bert_config)
    model.to(device)
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file, config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file, config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file, config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    pickle.dump(train_schema_vocab, open(train_vocab_file, "wb"))
    pickle.dump(valid_schema_vocab, open(valid_vocab_file, "wb"))
    if test_schema_vocab is not None:
        pickle.dump(test_schema_vocab, open(test_vocab_file, "wb"))

    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    train_embed = []
    for desc in tqdm(train_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        train_embed.append(embed)
    train_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)] for sent in desc]
                  for desc in train_desc]
    pickle.dump(train_embed, open(train_embed_file, "wb"))
    pickle.dump(train_desc, open(train_desc_file, "wb"))

    valid_embed = []
    for desc in tqdm(valid_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        valid_embed.append(embed)
    valid_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)] for sent in desc]
                  for desc in valid_desc]
    pickle.dump(valid_embed, open(valid_embed_file, "wb"))
    pickle.dump(valid_desc, open(valid_desc_file, "wb"))

    if test_desc is None:
        exit()

    test_embed = []
    for desc in tqdm(test_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        test_embed.append(embed)
    test_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)] for sent in desc]
                 for desc in test_desc]
    pickle.dump(test_embed, open(test_embed_file, "wb"))
    pickle.dump(test_desc, open(test_desc_file, "wb"))
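# Hedged sketch of what a get_rep-style helper could do (an assumption, not the repo's actual
# get_rep): pick one of the hidden layers exposed by output_hidden_states=True and mean-pool it
# into a fixed-size sentence vector. The function name sentence_rep is hypothetical.
import torch


def sentence_rep(sent, model, tokenizer, layer, device):
    ids = tokenizer.encode(sent, add_special_tokens=True)
    ids = torch.tensor(ids, device=device).unsqueeze(0)
    with torch.no_grad():
        outputs = model(ids)
    # (embeddings, layer_1, ..., layer_N); present because output_hidden_states=True
    hidden_states = outputs[2]
    return hidden_states[layer].squeeze(0).mean(dim=0).cpu()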