Example #1
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.BERT_MODEL = os.path.abspath(sys.argv[0] + '/../sci_bert_base')
        self.KNOW_BERT_MODEL = os.path.abspath(sys.argv[0] +
                                               '/../sci_knowledge_bert_base')
        self.CHANNELS = 12 + 1  # from bert-base-uncased
        self.BERT_SIZE = 768  # from bert-base-uncased
        self.bert = CustomBertModel.from_pretrained(self.BERT_MODEL)
        self.tokenizer = BertTokenizer.from_pretrained(self.KNOW_BERT_MODEL)
        self.entity2id = modeling_util.get_entid(self.config["kg_path"] +
                                                 '/entity2id.txt')
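
The helper modeling_util.get_entid is project-specific and not shown in this snippet. Assuming it mirrors the explicit loop in Example #2 below (entity2id.txt starts with a count line, followed by one tab-separated QID/id pair per line), a minimal sketch could look like this; the body is an assumption, not the project's actual implementation.

# Hypothetical sketch of modeling_util.get_entid, assuming the entity2id.txt
# layout used in Example #2: a leading count line, then "QID\tid" pairs.
def get_entid(path):
    entity2id = {}
    with open(path, 'r') as fin:
        fin.readline()  # skip the count line
        for line in fin:
            qid, eid = line.strip().split('\t')
            entity2id[qid] = int(eid)
    return entity2id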
Example #2
    def __init__(self, args, vocab_subset=None):
        super().__init__()

        if args.bert_model_dir is None:
            raise ValueError('require bert_model_dir')

        self.dict_file = os.path.join(args.bert_model_dir,
                                      args.bert_vocab_name)
        print('loading ERNIE model from {}'.format(args.bert_model_dir))

        # ERNIE is uncased
        do_lower_case = True

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_dir)
        cbt = CustomBasicTokenizer(do_lower_case=do_lower_case)
        self.tokenizer.basic_tokenizer = cbt

        # original vocab
        self.map_indices = None
        self.vocab = list(self.tokenizer.ids_to_tokens.values())
        self._init_inverse_vocab()

        # Load pre-trained model (weights)
        self.ernie_model, _ = BertForMaskedLM.from_pretrained(
            args.bert_model_dir)
        self.ernie_model.eval()

        # Load entity embeddings
        print('loading entity embeddings')
        self.ent_map = {}
        with open(os.path.join(args.kg_path, 'entity_map.txt')) as fin:
            for line in fin:
                name, qid = line.strip().split('\t')
                self.ent_map[name] = qid

        self.entity2id = {}
        with open(os.path.join(args.kg_path, 'entity2id.txt'), 'r') as fin:
            fin.readline()
            for line in fin:
                qid, eid = line.strip().split('\t')
                self.entity2id[qid] = int(eid)
        vecs = np.load(os.path.join(
            args.kg_path,
            'entity2vec.npy'))  # the first row is an all-zeros padding vector
        self.kg_emb = torch.nn.Embedding.from_pretrained(
            torch.FloatTensor(vecs))

        self.bert_model = self.ernie_model.bert
        self.pad_id = self.inverse_vocab[BERT_PAD]
        self.unk_index = self.inverse_vocab[BERT_UNK]
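
A minimal usage sketch, assuming an instance probe built by the __init__ above: the loaded maps chain an entity name to its Wikidata QID and then to a row of the pretrained entity embedding table. Whether an index offset is needed for the zero padding row depends on how entity2vec.npy was exported, so treat this as illustrative only.

import torch

# `probe` is a hypothetical instance of the class whose __init__ is shown above.
qid = probe.ent_map['Jim Henson']                # entity name -> Wikidata QID
eid = probe.entity2id[qid]                       # QID -> integer id in the KG embedding table
ent_vec = probe.kg_emb(torch.LongTensor([eid]))  # look up the pretrained entity vector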
Example #3
import torch
from knowledge_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('ernie_base')

# Tokenized input
text_a = "Who was Jim Henson ? "
text_b = "Jim Henson was a puppeteer ."

# Use TAGME
import tagme
# Set the authorization token for subsequent calls.
tagme.GCUBE_TOKEN = "<Your token goes here>"
text_a_ann = tagme.annotate(text_a)
text_b_ann = tagme.annotate(text_b)

# Read entity map
ent_map = {}
with open("kg_embed/entity_map.txt") as fin:
    for line in fin:
        name, qid = line.strip().split("\t")
        ent_map[name] = qid


def get_ents(ann):
    ents = []
    # Keep annotations scoring above 0.3 and map their Wikipedia titles to Wikidata QIDs
    for a in ann.get_annotations(0.3):
        if a.entity_title not in ent_map:
            continue
        ents.append([ent_map[a.entity_title], a.begin, a.end, a.score])
    return ents
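
In the ERNIE README this pipeline then collects the entity lists for both sentences before tokenization; a likely continuation, not part of this excerpt:

# Likely continuation (not in the excerpt above): the [QID, begin, end, score]
# lists for both sentences are what knowledge_bert's tokenizer consumes
# alongside the raw text.
ents_a = get_ents(text_a_ann)
ents_b = get_ents(text_b_ann)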
Example #4
def main():
    device = get_device()
    if NUM_LABELS == 2:
        class_names = ['True', 'Fake']
    else:
        class_names = [
            'True', 'Mostly-true', 'Half-true', 'Barely-true', 'False',
            'Pants-fire'
        ]

    data_processor = CovidDataProcessor()
    labels, statements = data_processor.load_dataset()

    labels = {
        'train':
        CovidDataProcessor.convert_labels(NUM_LABELS, labels['train']),
        'test':
        CovidDataProcessor.convert_labels(NUM_LABELS, labels['test']),
        'validation':
        CovidDataProcessor.convert_labels(NUM_LABELS, labels['validation'])
    }

    # Load pre-trained model tokenizer
    tokenizer = BertTokenizer.from_pretrained(ERNIE_BASE_PATH,
                                              do_lower_case=True)

    with open('embed.txt', 'rb') as f:
        embed = pickle.load(f)
    with open('entity2id.txt', 'rb') as f:
        entity2id = pickle.load(f)
    with open('ent_map.txt', 'rb') as f:
        ent_map = pickle.load(f)

    # # All saved dataloaders below were generated with batch_size = 4; if the batch size changes, they must be regenerated.
    # if Path('covid_train_dataloader.txt').is_file():
    #     with open('covid_train_dataloader.txt', 'rb') as ff:
    #         train_dataloader = pickle.load(ff)
    # else:
    #     print('generating train_dataloader')
    #     train_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['train'], labels['train'], MAX_LEN,
    #                                                                tokenizer, BATCH_SIZE, entity2id, ent_map)
    #     with open('covid_train_dataloader.txt', 'wb') as ff:
    #         pickle.dump(train_dataloader, ff)
    #
    # if Path('covid_test_dataloader.txt').is_file():
    #     with open('covid_test_dataloader.txt', 'rb') as ff:
    #         test_dataloader = pickle.load(ff)
    # else:
    #     print('generating test_dataloader')
    #     test_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['test'], labels['test'], MAX_LEN,
    #                                                               tokenizer, BATCH_SIZE, entity2id, ent_map)
    #     with open('covid_test_dataloader.txt', 'wb') as ff:
    #         pickle.dump(test_dataloader, ff)
    #
    # if Path('covid_val_dataloader.txt').is_file():
    #     with open('covid_val_dataloader.txt', 'rb') as ff:
    #         validation_dataloader = pickle.load(ff)
    # else:
    #     print('generating validation_dataloader')
    #     validation_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['validation'], labels['validation'],
    #                                                                     MAX_LEN, tokenizer, BATCH_SIZE, entity2id,
    #                                                                     ent_map)
    #     with open('covid_val_dataloader.txt', 'wb') as ff:
    #         pickle.dump(validation_dataloader, ff)

    # All saved dataloaders below were generated with batch_size = 4; if the batch size changes, they must be regenerated.
    # This variant passes a RandomSampler to the dataloader.
    # if Path('covid_train_dataloader_random.txt').is_file():
    #     with open('covid_train_dataloader_random.txt', 'rb') as ff:
    #         train_dataloader = pickle.load(ff)
    # else:
    #     print('generating train_dataloader')
    #     train_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['train'], labels['train'], MAX_LEN,
    #                                                                tokenizer, BATCH_SIZE, entity2id, ent_map)
    #     with open('covid_train_dataloader_random.txt', 'wb') as ff:
    #         pickle.dump(train_dataloader, ff)
    #
    # if Path('covid_test_dataloader_random.txt').is_file():
    #     with open('covid_test_dataloader_random.txt', 'rb') as ff:
    #         test_dataloader = pickle.load(ff)
    # else:
    #     print('generating test_dataloader')
    #     test_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['test'], labels['test'], MAX_LEN,
    #                                                               tokenizer, BATCH_SIZE, entity2id, ent_map)
    #     with open('covid_test_dataloader_random.txt', 'wb') as ff:
    #         pickle.dump(test_dataloader, ff)
    #
    # if Path('covid_val_dataloader.txt_random').is_file():
    #     with open('covid_val_dataloader.txt_random', 'rb') as ff:
    #         validation_dataloader = pickle.load(ff)
    # else:
    #     print('generating validation_dataloader')
    #     validation_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['validation'],
    #                                                                     labels['validation'],
    #                                                                     MAX_LEN, tokenizer, BATCH_SIZE, entity2id,
    #                                                                     ent_map)
    #     with open('covid_val_dataloader.txt_random', 'wb') as ff:
    #         pickle.dump(validation_dataloader, ff)

    # # All saved LIAR dataloaders were generated with batch_size = 2; if the batch size changes, they must be regenerated.
    # if Path('liar_train_dataloader.txt').is_file():
    #     with open('liar_train_dataloader.txt', 'rb') as ff:
    #         train_dataloader = pickle.load(ff)
    # else:
    #     print('generating train_dataloader')
    #     train_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['train'], labels['train'], MAX_LEN,
    #                                                                tokenizer, BATCH_SIZE, entity2id, ent_map)
    #     with open('liar_train_dataloader.txt', 'wb') as ff:
    #         pickle.dump(train_dataloader, ff)
    #
    # if Path('liar_test_dataloader.txt').is_file():
    #     with open('liar_test_dataloader.txt', 'rb') as ff:
    #         test_dataloader = pickle.load(ff)
    # else:
    #     print('generating test_dataloader')
    #     test_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['test'], labels['test'], MAX_LEN,
    #                                                               tokenizer, BATCH_SIZE, entity2id, ent_map)
    #     with open('liar_test_dataloader.txt', 'wb') as ff:
    #         pickle.dump(test_dataloader, ff)
    #
    # if Path('liar_val_dataloader.txt').is_file():
    #     with open('liar_val_dataloader.txt', 'rb') as ff:
    #         validation_dataloader = pickle.load(ff)
    # else:
    #     print('generating validation_dataloader')
    #     validation_dataloader = CovidDataProcessor.get_ernie_dataloader(statements['validation'], labels['validation'],
    #                                                                     MAX_LEN, tokenizer, BATCH_SIZE, entity2id,
    #                                                                     ent_map)
    #     with open('liar_val_dataloader.txt', 'wb') as ff:
    #         pickle.dump(validation_dataloader, ff)

    # The blocks above are for num_labels = 6; below is the binary case for LIAR.
    if Path('binary_liar_train_dataloader.txt').is_file():
        with open('binary_liar_train_dataloader.txt', 'rb') as ff:
            train_dataloader = pickle.load(ff)
    else:
        print('generating train_dataloader')
        train_dataloader = CovidDataProcessor.get_ernie_dataloader(
            statements['train'], labels['train'], MAX_LEN, tokenizer,
            BATCH_SIZE, entity2id, ent_map)
        with open('binary_liar_train_dataloader.txt', 'wb') as ff:
            pickle.dump(train_dataloader, ff)

    if Path('binary_liar_test_dataloader.txt').is_file():
        with open('binary_liar_test_dataloader.txt', 'rb') as ff:
            test_dataloader = pickle.load(ff)
    else:
        print('generating test_dataloader')
        test_dataloader = CovidDataProcessor.get_ernie_dataloader(
            statements['test'], labels['test'], MAX_LEN, tokenizer, BATCH_SIZE,
            entity2id, ent_map)
        with open('binary_liar_test_dataloader.txt', 'wb') as ff:
            pickle.dump(test_dataloader, ff)

    if Path('binary_liar_val_dataloader.txt').is_file():
        with open('binary_liar_val_dataloader.txt', 'rb') as ff:
            validation_dataloader = pickle.load(ff)
    else:
        print('generating validation_dataloader')
        validation_dataloader = CovidDataProcessor.get_ernie_dataloader(
            statements['validation'], labels['validation'], MAX_LEN, tokenizer,
            BATCH_SIZE, entity2id, ent_map)
        with open('binary_liar_val_dataloader.txt', 'wb') as ff:
            pickle.dump(validation_dataloader, ff)

    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    # Train model
    model, _ = BertForSequenceClassification.from_pretrained(
        ERNIE_BASE_PATH, num_labels=NUM_LABELS)
    model.to(device)

    ErnieModel.train_model(model, train_dataloader, validation_dataloader,
                           EPOCHS, device, loss_fn, embed)

    # evaluate model on test dataset
    test_acc, test_loss = ErnieModel.eval_model(model, test_dataloader, device,
                                                embed)
    print('test accuracy: ', test_acc.item())

    # predictions
    pred, test_labels = ErnieModel.get_predictions(model, test_dataloader,
                                                   device, embed)

    print(
        classification_report(test_labels,
                              pred,
                              target_names=class_names,
                              digits=4))
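
Every commented-out block above repeats the same cache-or-regenerate pattern. A hypothetical helper (not part of the original script) that factors it out using only the standard library:

from pathlib import Path
import pickle


def load_or_build(cache_file, build_fn):
    """Load a pickled dataloader if the cache exists, otherwise build and cache it."""
    path = Path(cache_file)
    if path.is_file():
        with path.open('rb') as fh:
            return pickle.load(fh)
    obj = build_fn()
    with path.open('wb') as fh:
        pickle.dump(obj, fh)
    return obj


# Example use with the same arguments as in the binary LIAR branch above:
# train_dataloader = load_or_build(
#     'binary_liar_train_dataloader.txt',
#     lambda: CovidDataProcessor.get_ernie_dataloader(
#         statements['train'], labels['train'], MAX_LEN, tokenizer,
#         BATCH_SIZE, entity2id, ent_map))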
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help=
        "Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Type of model to train.",
    )
    parser.add_argument(
        "--model_save_name",
        default=None,
        type=str,
        required=True,
        help="Name to use when saving the trained model and its results.",
    )
    parser.add_argument(
        "--train_setting",
        default='relaxed',
        type=str,
        required=False,
        help=
        "Whether to train in the strict or the relaxed setting. Options: strict, relaxed.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run the model on the dev set.")
    parser.add_argument("--do_test",
                        action="store_true",
                        help="Whether to run the model on the test set.")
    parser.add_argument("--evaluate_during_training",
                        action="store_true",
                        help="Whether to evaluate during training.")
    parser.add_argument("--multi_task",
                        action="store_true",
                        help="Multi-task learning flag.")

    parser.add_argument("--train_batch_size",
                        default=20,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--train_epochs",
                        default=5,
                        type=int,
                        help="Training epochs.")
    parser.add_argument("--GRAD_ACC",
                        default=1,
                        type=int,
                        help="Gradient accumulation steps.")
    parser.add_argument("--eval_batch_size",
                        default=20,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation/testing.")
    parser.add_argument("--lr",
                        default=2e-5,
                        type=float,
                        help="Learning rate.")
    parser.add_argument("--auxiliary_task_wt",
                        default=0.3,
                        type=float,
                        help="Weight for the auxiliary task.")
    parser.add_argument("--weight_decay",
                        default=1e-4,
                        type=float,
                        help="Weight decay.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Warmup proportion.")
    parser.add_argument("--gpu",
                        default=0,
                        type=int,
                        help="which GPU is to be used for training.")

    args = parser.parse_args()

    data = pickle.load(open(args.data_dir, 'rb'))
    selected_sem_types = pickle.load(open('../data/selected_ents.pkl', 'rb'))
    print('Selected semantic types: ', selected_sem_types)

    if args.train_setting == 'strict':
        data = data['strict_split']
    else:
        data = data['split']

    entity2id = utils.prepare_entities_to_ix(selected_sem_types)
    logical2ix = utils.prepare_logical_forms_to_ix(data['train'])

    shuffle(data['train'])
    shuffle(data['dev'])
    shuffle(data['test'])
    print(entity2id)

    model_config = {
        'label_size': 2,
        'num_entities': len(selected_sem_types) + 1,
        'entity_dim': 100,
        'lr': args.lr,
        'weight_decay': args.weight_decay,
        'batch_size': args.train_batch_size,
        'data_path': args.data_dir,
        'model_name': args.model_save_name,
        'bert_model': args.model_name_or_path,
        'do_lower_case': True,
        'gradient_accumulation_steps': args.GRAD_ACC
    }

    if args.model_type == 'ernie':
        from knowledge_bert import modeling
        from knowledge_bert import BertTokenizer
        from knowledge_bert.optimization import BertAdam

        tokenizer = BertTokenizer.from_pretrained(
            model_config['bert_model'],
            do_lower_case=model_config['do_lower_case'])
        model, _ = modeling.BertForQuestionAnsweringEmrQA.from_pretrained(
            model_config['bert_model'],
            num_entities=model_config['num_entities'])
    elif args.model_type == 'bert':
        from pytorch_pretrained_bert import BertTokenizer, BertForQuestionAnswering
        from pytorch_pretrained_bert.optimization import BertAdam
        tokenizer = BertTokenizer.from_pretrained(
            model_config['bert_model'],
            do_lower_case=model_config['do_lower_case'])
        model = BertForQuestionAnswering.from_pretrained(
            model_config['bert_model'])

    num_train_optimization_steps = len(
        data['train']
    ) // model_config['gradient_accumulation_steps'] * args.train_epochs

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=model_config['lr'],
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    if args.do_train:
        model_trained = train(args,
                              model=model,
                              optimizer=optimizer,
                              tokenizer=tokenizer,
                              model_config=model_config,
                              data=data,
                              entity2id=entity2id,
                              logical2ix=logical2ix)

    # The start and end accuracies reported here are only proxies; the actual accuracy is computed from the pickled predictions with the official SQuAD evaluation script: https://rajpurkar.github.io/SQuAD-explorer/
    ##### Evaluate the model if the do_eval flag is set
    if args.do_eval:
        if args.model_type == 'ernie':
            if args.multi_task:
                device = torch.device("cuda:" + str(args.gpu))
                dev_vals = eval_plot.evaluate_bert_emrqa_ernie_multitask(
                    model_trained, data['dev'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix, device)
            else:
                dev_vals = eval_plot.evaluate_bert_emrqa_ernie(
                    model_trained, data['dev'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix)
        elif args.model_type == 'bert':
            dev_vals = eval_plot.evaluate_bert_emrqa(model_trained,
                                                     data['dev'],
                                                     args.eval_batch_size,
                                                     tokenizer)
        dict_ = {
            'start_accuracy': dev_vals[0],
            'end_accuracy': dev_vals[1],
            'actual_and_predicted_values': dev_vals[2]
        }
        file_name = '../results/' + model_config[
            'model_name'] + '_dev_results.pkl'
        pickle.dump(dict_, open(file_name, 'wb'))

    ##### Test the model
    if args.do_test:
        if args.model_type == 'ernie':
            if args.multi_task:
                device = torch.device("cuda:" + str(args.gpu))
                test_vals = eval_plot.evaluate_bert_emrqa_ernie_multitask(
                    model_trained, data['test'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix, device)
            else:
                test_vals = eval_plot.evaluate_bert_emrqa_ernie(
                    model_trained, data['test'], args.eval_batch_size,
                    tokenizer, entity2id, logical2ix)
        elif args.model_type == 'bert':
            test_vals = eval_plot.evaluate_bert_emrqa(model_trained,
                                                      data['test'],
                                                      args.eval_batch_size,
                                                      tokenizer)
        dict_ = {
            'start_accuracy': test_vals[0],
            'end_accuracy': test_vals[1],
            'actual_and_predicted_values': test_vals[2]
        }
        file_name = '../results/' + model_config[
            'model_name'] + '_test_results.pkl'
        pickle.dump(dict_, open(file_name, 'wb'))
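
The evaluation and test branches above pickle a dict with the keys start_accuracy, end_accuracy and actual_and_predicted_values. A small sketch of reading those results back; the file name is a placeholder built from --model_save_name, not a path from the original script.

import pickle

# '<model_save_name>' stands in for the value passed as --model_save_name.
with open('../results/<model_save_name>_test_results.pkl', 'rb') as fh:
    results = pickle.load(fh)
print('start accuracy:', results['start_accuracy'])
print('end accuracy:', results['end_accuracy'])
predictions = results['actual_and_predicted_values']  # input to the official SQuAD evaluation script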
Example #6
# coding:utf-8

import json
import numpy as np
import torch

from knowledge_bert import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained(
    '/home/liuyu/kesel/global_encoder/ernie_pretrain/ernie_base')
'''
path = /home/liuyu/kesel/local_encoder/process_data/mention_rank_wiki
target: mention_rank_wiki is processed into the ERNIE model input format.
Three mentions form one sequence; the text_seq and ents_seq formats are as follows:
text_a, ents_a = "Jim Henson was a puppeteer .", [['Q191037', 0, 10, 0.0], ['Q2629392', 17, 26, 0.0]]
labels_seq shape (batch, seq) | ents_index shape (batch, seq) | cands_ents_id shape (batch, seq, n_cands)
'''


class DataUtil():
    def __init__(self):
        pass

    def process_global_data(self, dname, data_path, local_rep_path, group_path,
                            local_fea_path, seq_len, candidate_entity_num):
        """
        :param data_path: 
        :param local_rep_path: local context representations
        :param group_path: groups of mentions, one group per document
        :param seq_len: length of the mention sequence within a document
        :param candidate_entity_num: number of candidate entities per mention