def __init__(self, config):
    super().__init__()
    self.config = config
    self.BERT_MODEL = os.path.abspath(sys.argv[0] + '/../sci_bert_base')
    self.KNOW_BERT_MODEL = os.path.abspath(sys.argv[0] +
                                           '/../sci_knowledge_bert_base')
    self.CHANNELS = 12 + 1  # from bert-base-uncased
    self.BERT_SIZE = 768  # from bert-base-uncased
    self.bert = CustomBertModel.from_pretrained(self.BERT_MODEL)
    self.tokenizer = BertTokenizer.from_pretrained(self.KNOW_BERT_MODEL)
    self.entity2id = modeling_util.get_entid(self.config["kg_path"] +
                                             '/entity2id.txt')
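# A minimal sketch of the modeling_util.get_entid helper used above -- an
# assumption, not the project's actual implementation. It mirrors the
# entity2id.txt parsing done elsewhere in this document: the first line holds
# the entity count, and each following line is a tab-separated "QID\tid" pair.
def get_entid(path):
    entity2id = {}
    with open(path, 'r') as fin:
        fin.readline()  # skip the header line (entity count)
        for line in fin:
            qid, eid = line.strip().split('\t')
            entity2id[qid] = int(eid)
    return entity2id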
def __init__(self, args, vocab_subset=None):
    super().__init__()
    if args.bert_model_dir is None:
        raise ValueError('require bert_model_dir')
    self.dict_file = os.path.join(args.bert_model_dir, args.bert_vocab_name)
    print('loading ERNIE model from {}'.format(args.bert_model_dir))

    # ERNIE is uncased
    do_lower_case = True

    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_dir)
    cbt = CustomBasicTokenizer(do_lower_case=do_lower_case)
    self.tokenizer.basic_tokenizer = cbt

    # original vocab
    self.map_indices = None
    self.vocab = list(self.tokenizer.ids_to_tokens.values())
    self._init_inverse_vocab()

    # Load pre-trained model (weights)
    self.ernie_model, _ = BertForMaskedLM.from_pretrained(args.bert_model_dir)
    self.ernie_model.eval()

    # Load entity embeddings
    print('loading entity embeddings')
    self.ent_map = {}
    with open(os.path.join(args.kg_path, 'entity_map.txt')) as fin:
        for line in fin:
            name, qid = line.strip().split('\t')
            self.ent_map[name] = qid

    self.entity2id = {}
    with open(os.path.join(args.kg_path, 'entity2id.txt'), 'r') as fin:
        fin.readline()  # skip the header line (entity count)
        for line in fin:
            qid, eid = line.strip().split('\t')
            self.entity2id[qid] = int(eid)

    vecs = np.load(os.path.join(args.kg_path, 'entity2vec.npy'))
    # the first row is a pad vector of all zeros
    self.kg_emb = torch.nn.Embedding.from_pretrained(torch.FloatTensor(vecs))

    self.bert_model = self.ernie_model.bert
    self.pad_id = self.inverse_vocab[BERT_PAD]
    self.unk_index = self.inverse_vocab[BERT_UNK]
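# Usage sketch (my illustration, not part of the original class): given an
# entity's surface name, chain ent_map -> entity2id -> kg_emb to fetch its
# pre-trained KG vector. Unknown entities fall back to index 0, the all-zero
# pad row noted above.
def lookup_entity_vector(model, name):
    qid = model.ent_map.get(name)
    eid = model.entity2id.get(qid, 0) if qid is not None else 0
    return model.kg_emb(torch.tensor([eid]))  # shape: (1, embedding_dim)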
import torch
from knowledge_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening,
# activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('ernie_base')

# Tokenized input
text_a = "Who was Jim Henson ? "
text_b = "Jim Henson was a puppeteer ."

# Use TAGME to annotate entity mentions
import tagme
# Set the authorization token for subsequent calls.
tagme.GCUBE_TOKEN = "<Your token goes here>"
text_a_ann = tagme.annotate(text_a)
text_b_ann = tagme.annotate(text_b)

# Read entity map (Wikipedia page title -> Wikidata QID)
ent_map = {}
with open("kg_embed/entity_map.txt") as fin:
    for line in fin:
        name, qid = line.strip().split("\t")
        ent_map[name] = qid


def get_ents(ann):
    # Keep TAGME annotations with confidence above 0.3 and map each annotated
    # Wikipedia title to its QID, with character offsets and the TAGME score.
    ents = []
    for a in ann.get_annotations(0.3):
        if a.entity_title not in ent_map:
            continue
        ents.append([ent_map[a.entity_title], a.begin, a.end, a.score])
    return ents
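# Continuation sketch (an assumption about the knowledge_bert tokenizer
# interface, matching the [QID, start, end, score] span format used elsewhere
# in this document): pass each text together with its entity spans to the
# knowledge-aware tokenizer to get aligned token and entity sequences.
ents_a = get_ents(text_a_ann)
ents_b = get_ents(text_b_ann)
tokens_a, entities_a = tokenizer.tokenize(text_a, ents_a)
tokens_b, entities_b = tokenizer.tokenize(text_b, ents_b)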
def main():
    device = get_device()

    if NUM_LABELS == 2:
        class_names = ['True', 'Fake']
    else:
        class_names = [
            'True', 'Mostly-true', 'Half-true', 'Barely-true', 'False',
            'Pants-fire'
        ]

    data_processor = CovidDataProcessor()
    labels, statements = data_processor.load_dataset()
    labels = {
        'train': CovidDataProcessor.convert_labels(NUM_LABELS, labels['train']),
        'test': CovidDataProcessor.convert_labels(NUM_LABELS, labels['test']),
        'validation': CovidDataProcessor.convert_labels(NUM_LABELS,
                                                        labels['validation'])
    }

    # Load pre-trained model tokenizer
    tokenizer = BertTokenizer.from_pretrained(ERNIE_BASE_PATH,
                                              do_lower_case=True)

    with open('embed.txt', 'rb') as f:
        embed = pickle.load(f)
    with open('entity2id.txt', 'rb') as f:
        entity2id = pickle.load(f)
    with open('ent_map.txt', 'rb') as f:
        ent_map = pickle.load(f)

    # Dataloader caches are tied to the batch size they were generated with;
    # regenerate a cache whenever BATCH_SIZE changes. Earlier experiments
    # followed the same load-or-generate pattern as the binary LIAR blocks
    # below, with these cache files:
    #   - COVID caches 'covid_{train,test,val}_dataloader.txt' (BATCH_SIZE = 4)
    #   - COVID caches with a RandomSampler,
    #     'covid_{train,test,val}_dataloader_random.txt' (BATCH_SIZE = 4)
    #   - six-label LIAR caches 'liar_{train,test,val}_dataloader.txt'
    #     (BATCH_SIZE = 2)
    # The blocks below handle the binary LIAR case (NUM_LABELS = 2).
    if Path('binary_liar_train_dataloader.txt').is_file():
        with open('binary_liar_train_dataloader.txt', 'rb') as ff:
            train_dataloader = pickle.load(ff)
    else:
        print('generating train_dataloader')
        train_dataloader = CovidDataProcessor.get_ernie_dataloader(
            statements['train'], labels['train'], MAX_LEN, tokenizer,
            BATCH_SIZE, entity2id, ent_map)
        with open('binary_liar_train_dataloader.txt', 'wb') as ff:
            pickle.dump(train_dataloader, ff)

    if Path('binary_liar_test_dataloader.txt').is_file():
        with open('binary_liar_test_dataloader.txt', 'rb') as ff:
            test_dataloader = pickle.load(ff)
    else:
        print('generating test_dataloader')
        test_dataloader = CovidDataProcessor.get_ernie_dataloader(
            statements['test'], labels['test'], MAX_LEN, tokenizer,
            BATCH_SIZE, entity2id, ent_map)
        with open('binary_liar_test_dataloader.txt', 'wb') as ff:
            pickle.dump(test_dataloader, ff)

    if Path('binary_liar_val_dataloader.txt').is_file():
        with open('binary_liar_val_dataloader.txt', 'rb') as ff:
            validation_dataloader = pickle.load(ff)
    else:
        print('generating validation_dataloader')
        validation_dataloader = CovidDataProcessor.get_ernie_dataloader(
            statements['validation'], labels['validation'], MAX_LEN,
            tokenizer, BATCH_SIZE, entity2id, ent_map)
        with open('binary_liar_val_dataloader.txt', 'wb') as ff:
            pickle.dump(validation_dataloader, ff)

    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    # Train model
    model, _ = BertForSequenceClassification.from_pretrained(
        ERNIE_BASE_PATH, num_labels=NUM_LABELS)
    model.to(device)
    ErnieModel.train_model(model, train_dataloader, validation_dataloader,
                           EPOCHS, device, loss_fn, embed)

    # Evaluate model on the test dataset
    test_acc, test_loss = ErnieModel.eval_model(model, test_dataloader,
                                                device, embed)
    print('test accuracy: ', test_acc.item())

    # Predictions
    pred, test_labels = ErnieModel.get_predictions(model, test_dataloader,
                                                   device, embed)
    print(classification_report(test_labels, pred,
                                target_names=class_names, digits=4))
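# A minimal sketch of the get_device() helper called at the top of main(),
# assuming the usual CUDA-if-available convention; the original helper is not
# shown in this snippet.
import torch

def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')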
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Type of model to train.",
    )
    parser.add_argument(
        "--model_save_name",
        default=None,
        type=str,
        required=True,
        help="Name under which to save the trained model and its result files.",
    )
    parser.add_argument(
        "--train_setting",
        default='relaxed',
        type=str,
        required=False,
        help="Whether to train in the strict or relaxed setting. Options: strict or relaxed",
    )
    parser.add_argument("--do_lower_case",
                        action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run the model on the dev set.")
    parser.add_argument("--do_test",
                        action="store_true",
                        help="Whether to run the model on the test set.")
    parser.add_argument("--evaluate_during_training",
                        action="store_true",
                        help="Whether to evaluate during training.")
    parser.add_argument("--multi_task",
                        action="store_true",
                        help="Multi-task learning flag.")
    parser.add_argument("--train_batch_size",
                        default=20,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--train_epochs",
                        default=5,
                        type=int,
                        help="Training epochs.")
    parser.add_argument("--GRAD_ACC",
                        default=1,
                        type=int,
                        help="Gradient accumulation steps.")
    parser.add_argument("--eval_batch_size",
                        default=20,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation/testing.")
    parser.add_argument("--lr",
                        default=2e-5,
                        type=float,
                        help="Learning rate.")
    parser.add_argument("--auxiliary_task_wt",
                        default=0.3,
                        type=float,
                        help="Weight for the auxiliary task.")
    parser.add_argument("--weight_decay",
                        default=1e-4,
                        type=float,
                        help="Weight decay.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Warmup proportion.")
    parser.add_argument("--gpu",
                        default=0,
                        type=int,
                        help="Which GPU to use for training.")
    args = parser.parse_args()

    data = pickle.load(open(args.data_dir, 'rb'))
    selected_sem_types = pickle.load(open('../data/selected_ents.pkl', 'rb'))
    print('Selected semantic types: ', selected_sem_types)

    if args.train_setting == 'strict':
        data = data['strict_split']
    else:
        data = data['split']

    entity2id = utils.prepare_entities_to_ix(selected_sem_types)
    logical2ix = utils.prepare_logical_forms_to_ix(data['train'])

    shuffle(data['train'])
    shuffle(data['dev'])
    shuffle(data['test'])
    print(entity2id)

    model_config = {
        'label_size': 2,
        'num_entities': len(selected_sem_types) + 1,
        'entity_dim': 100,
        'lr': args.lr,
        'weight_decay': args.weight_decay,
        'batch_size': args.train_batch_size,
        'data_path': args.data_dir,
        'model_name': args.model_save_name,
        'bert_model': args.model_name_or_path,
        'do_lower_case': True,
        'gradient_accumulation_steps': args.GRAD_ACC
    }

    if args.model_type == 'ernie':
        from knowledge_bert import modeling
        from knowledge_bert import BertTokenizer
        from knowledge_bert.optimization import BertAdam

        tokenizer = BertTokenizer.from_pretrained(
            model_config['bert_model'],
            do_lower_case=model_config['do_lower_case'])
        model, _ = modeling.BertForQuestionAnsweringEmrQA.from_pretrained(
            model_config['bert_model'],
            num_entities=model_config['num_entities'])
    elif args.model_type == 'bert':
        from pytorch_pretrained_bert import BertTokenizer, BertForQuestionAnswering
        from pytorch_pretrained_bert.optimization import BertAdam

        tokenizer = BertTokenizer.from_pretrained(
            model_config['bert_model'],
            do_lower_case=model_config['do_lower_case'])
        model = BertForQuestionAnswering.from_pretrained(
            model_config['bert_model'])

    num_train_optimization_steps = len(
        data['train']
    ) // model_config['gradient_accumulation_steps'] * args.train_epochs

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=model_config['lr'],
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    if args.do_train:
        model_trained = train(args,
                              model=model,
                              optimizer=optimizer,
                              tokenizer=tokenizer,
                              model_config=model_config,
                              data=data,
                              entity2id=entity2id,
                              logical2ix=logical2ix)

    # The start and end accuracies are only proxies; the actual accuracy is
    # computed from the pickle dump with the official SQuAD evaluation script:
    # https://rajpurkar.github.io/SQuAD-explorer/

    ##### Evaluate the model if the do_eval flag is on
    if args.do_eval:
        if args.model_type == 'ernie':
            if args.multi_task:
                device = torch.device("cuda:" + str(args.gpu))
                dev_vals = eval_plot.evaluate_bert_emrqa_ernie_multitask(
                    model_trained, data['dev'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix, device)
            else:
                dev_vals = eval_plot.evaluate_bert_emrqa_ernie(
                    model_trained, data['dev'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix)
        elif args.model_type == 'bert':
            dev_vals = eval_plot.evaluate_bert_emrqa(model_trained,
                                                     data['dev'],
                                                     args.eval_batch_size,
                                                     tokenizer)
        dict_ = {
            'start_accuracy': dev_vals[0],
            'end_accuracy': dev_vals[1],
            'actual_and_predicted_values': dev_vals[2]
        }
        file_name = '../results/' + model_config['model_name'] + '_dev_results.pkl'
        pickle.dump(dict_, open(file_name, 'wb'))

    ##### Test the model
    if args.do_test:
        if args.model_type == 'ernie':
            if args.multi_task:
                device = torch.device("cuda:" + str(args.gpu))
                test_vals = eval_plot.evaluate_bert_emrqa_ernie_multitask(
                    model_trained, data['test'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix, device)
            else:
                test_vals = eval_plot.evaluate_bert_emrqa_ernie(
                    model_trained, data['test'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix)
        elif args.model_type == 'bert':
            test_vals = eval_plot.evaluate_bert_emrqa(model_trained,
                                                      data['test'],
                                                      args.eval_batch_size,
                                                      tokenizer)
        dict_ = {
            'start_accuracy': test_vals[0],
            'end_accuracy': test_vals[1],
            'actual_and_predicted_values': test_vals[2]
        }
        file_name = '../results/' + model_config['model_name'] + '_test_results.pkl'
        pickle.dump(dict_, open(file_name, 'wb'))
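# A hypothetical sketch of utils.prepare_entities_to_ix, written only to match
# how it is used above (num_entities = len(selected_sem_types) + 1 suggests one
# extra index reserved for "no entity"/padding). The real helper may differ.
def prepare_entities_to_ix(sem_types, pad_token='PAD'):
    ent2ix = {pad_token: 0}  # index 0 reserved for padding / no entity
    for t in sorted(sem_types):
        ent2ix[t] = len(ent2ix)
    return ent2ix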
# coding:utf-8
import json
import numpy as np
import torch
from knowledge_bert import BertTokenizer, BertModel, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained(
    '/home/liuyu/kesel/global_encoder/ernie_pretrain/ernie_base')
'''
path = /home/liuyu/kesel/local_encoder/process_data/mention_rank_wiki
target: mention_rank_wiki is processed into the ERNIE model input format.
Three mentions form a sequence; the text_seq and ents_seq format is as follows:
text_a, ents_a = "Jim Henson was a puppeteer .", [['Q191037', 0, 10, 0.0], ['Q2629392', 17, 26, 0.0]]
labels_seq shape (batch, seq) | ents_index shape (batch, seq) | cands_ents_id shape (batch, seq, n_cands)
'''


class DataUtil():
    def __init__(self):
        pass

    def process_global_data(self, dname, data_path, local_rep_path,
                            group_path, local_fea_path, seq_len,
                            candidate_entity_num):
        """
        :param data_path:
        :param local_rep_path: local context representations
        :param group_path: groups of mentions partitioned by document
        :param seq_len: length of the mention sequence within a document
        :param candidate_entity_num: number of candidate entities per mention
        """
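# Illustrative sketch (my addition, not from the original file): dummy tensors
# matching the shapes documented in the module docstring above, for batch = 2
# documents, seq = 3 mentions per sequence, and n_cands = 5 candidate entities
# per mention. The per-field comments are my interpretation of the field names.
batch, seq, n_cands = 2, 3, 5
labels_seq = torch.zeros(batch, seq, dtype=torch.long)  # presumably the gold candidate per mention
ents_index = torch.zeros(batch, seq, dtype=torch.long)  # presumably each mention's position in the sequence
cands_ents_id = torch.zeros(batch, seq, n_cands, dtype=torch.long)  # candidate entity ids per mention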