示例#1
0
def test():
    """Evaluate the trained trigger-classification model on the test set."""
    # Configuration and vocabularies.
    cfg = Config()
    print('settings:\n', cfg)
    print('loading corpus')
    token_vocab = load_vocab(cfg.vocab)
    # Loaded for parity with the training pipeline (reads the label file).
    label_dic = load_vocab(cfg.tri_cls_label_file)
    # Read the test corpus; each sample is (ids, mask, type_mask, tag).
    samples = read_corpus_tri_cls(cfg.tri_cls_test_file,
                                  max_length=cfg.max_length,
                                  vocab=token_vocab)
    # Turn the four per-sample fields into four LongTensor columns.
    columns = [torch.LongTensor([sample[field] for sample in samples])
               for field in range(4)]
    loader = DataLoader(TensorDataset(*columns),
                        shuffle=False,
                        batch_size=cfg.batch_size)
    # Restore the trained binary QA-style classifier.
    net = BertQA(cfg.bert_path, 2)
    net = load_model(net, name=cfg.load_tri_cls_path)
    if cfg.use_cuda and torch.cuda.is_available():
        net.cuda()
    # Run evaluation; the epoch index 0 is only used for logging.
    evaluate(net, loader, 0, cfg)
示例#2
0
文件: main.py 项目: yingzhou20/nlp-hw
 # NOTE(review): this chunk starts mid-script and the interactive loop is
 # truncated below (`input_cls` is built but its consumer is not visible here).
 config = Config()
 # data preprocess
 ace_preprocess()
 # load model
 label_dic = load_vocab(config.tri_id_label_file)
 tagset_size = len(label_dic)
 # Sequence tagger (BERT + LSTM + CRF) for trigger identification.
 model_id = BertLstmCrf(config.bert_path,
                        tagset_size,
                        config.bert_embedding,
                        config.rnn_hidden,
                        config.rnn_layer,
                        dropout_ratio=config.dropout_ratio,
                        dropout1=config.dropout1,
                        use_cuda=config.use_cuda)
 model_id = load_model(model_id, name=config.load_path)
 # Binary QA-style classifier for trigger classification.
 model_cls = BertQA(config.bert_path, 2)
 model_cls = load_model(model_cls, name=config.load_tri_cls_path)
 # predict
 if not config.input_file:
     # Interactive mode: read sentences from stdin until the user types 'exit'.
     while True:
         # get input
         sent = input()
         if sent == 'exit':
             break
         # trigger identification
         pred_label = tri_id_pre(config, model_id, sent)
         for label in pred_label:
             print(label, end=' ')
         print()
         # trigger classification
         # Sentence and predicted tag sequence are joined with a '|||' marker.
         input_cls = sent + '|||' + ' '.join(pred_label)
示例#3
0
# Seed all RNG sources so the run is reproducible.
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

# Distributed setup: non-master ranks wait here so only the master rank
# downloads the pretrained weights first.
if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()

tokenizer = BertTokenizer.from_pretrained(bert_model,
                                          do_lower_case=do_lower_case)
# Total optimizer steps = (examples / batch / accumulation) * epochs.
num_train_optimization_steps = int(
    len(train_InputExamples) / batch_size /
    gradient_accumulation_steps) * num_epochs

# Per-rank cache directory avoids concurrent writes to a shared model cache.
model_qa = BertQA.from_pretrained(
    bert_model,
    cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                           'distributed_{}'.format(args.local_rank)))

# Master rank has finished downloading; release the waiting ranks.
if args.local_rank == 0:
    torch.distributed.barrier()

model_qa.to(device)

# Featurize the training examples into fixed-length id/mask/segment tensors.
train_features = bert_utils.convert_examples_to_features(
    train_InputExamples, MAX_SEQ_LENGTH, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in train_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                               dtype=torch.long)
示例#4
0
print()

# Effective per-step batch size after gradient accumulation.
batch_size = batch_size // gradient_accumulation_steps

# Seed all RNG sources so the run is reproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

# Non-master ranks wait so the fine-tuned model in output_dir is
# loaded by the master rank first.
if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()

# Reload tokenizer and fine-tuned model from the training output directory.
tokenizer = BertTokenizer.from_pretrained(args.output_dir,
                                          do_lower_case=do_lower_case)
model_qa = BertQA.from_pretrained(args.output_dir)
model_qa.to(device)

# Featurize the dev set: ids / mask / segments plus start/end span labels.
dev_features = bert_utils.convert_examples_to_features(dev_InputExamples,
                                                       MAX_SEQ_LENGTH,
                                                       tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in dev_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in dev_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in dev_features],
                               dtype=torch.long)
all_start_positions = torch.tensor([f.start_label_ids for f in dev_features],
                                   dtype=torch.long)
all_end_positions = torch.tensor([f.end_label_ids for f in dev_features],
                                 dtype=torch.long)
示例#5
0
def init_all(config, gpu_list, checkpoint, mode, *args, **params):
    """Initialize datasets, model, and optimizer for training or testing.

    Args:
        config: project config object (supports ``config.get(section, key)``).
        gpu_list: list of GPU ids; an empty list means CPU only.
        checkpoint: path of a checkpoint file to restore (may be invalid/None;
            a failed load is only fatal in test mode).
        mode: ``"train"`` builds train/valid datasets and optimizer state;
            any other value builds the test dataset only.

    Returns:
        dict with ``"model"`` plus either ``"test_dataset"`` or
        ``"train_dataset"``/``"valid_dataset"``; in train mode also
        ``"optimizer"``, ``"trained_epoch"``, ``"output_function"`` and
        ``"global_step"``.

    Raises:
        Exception: re-raises the checkpoint-loading error in test mode,
            where running without restored weights would be meaningless.
    """
    result = {}

    logger.info("Begin to initialize dataset and formatter...")
    if mode == "train":
        init_formatter(config, ["train", "valid"], *args, **params)

        result["train_dataset"], result["valid_dataset"] = init_dataset(
            config, *args, **params)
    else:
        init_formatter(config, ["test"], *args, **params)
        result["test_dataset"] = init_test_dataset(config, *args, **params)

    logger.info("Begin to initialize models...")

    model = BertQA(config, gpu_list, *args, **params)
    optimizer = init_optimizer(model, config, *args, **params)
    trained_epoch = -1
    global_step = 0

    if len(gpu_list) > 0:
        model = model.cuda()

        # Multi-GPU setup is best-effort: models without init_multi_gpu fall
        # back to a single GPU.  FIX: log the actual exception so genuine
        # failures *inside* init_multi_gpu are no longer silently hidden
        # behind a misleading "not implemented" message.
        try:
            model.init_multi_gpu(gpu_list, config, *args, **params)
        except Exception as e:
            logger.warning(
                "init_multi_gpu failed (%s), use single gpu instead." % str(e))

    try:
        parameters = torch.load(checkpoint)
        model.load_state_dict(parameters["model"])

        if mode == "train":
            trained_epoch = parameters["trained_epoch"]
            # Only restore optimizer state when the optimizer type matches.
            if config.get("train",
                          "optimizer") == parameters["optimizer_name"]:
                optimizer.load_state_dict(parameters["optimizer"])
            else:
                logger.warning(
                    "Optimizer changed, do not load parameters of optimizer.")

            if "global_step" in parameters:
                global_step = parameters["global_step"]

    except Exception as e:
        information = "Cannot load checkpoint file with error %s" % str(e)
        if mode == "test":
            # Testing without restored weights is useless: fail loudly.
            logger.error(information)
            raise e
        else:
            # Training can legitimately start from scratch.
            logger.warning(information)

    result["model"] = model
    if mode == "train":
        result["optimizer"] = optimizer
        result["trained_epoch"] = trained_epoch
        result["output_function"] = basic_output_function
        result["global_step"] = global_step

    logger.info("Initialize done.")

    return result
示例#6
0
def _make_tri_cls_loader(file_path, vocab, config, shuffle):
    """Read a trigger-classification corpus file and wrap it in a DataLoader.

    Each corpus sample is (input_ids, input_mask, type_mask, tag); the four
    fields become the four tensors of the TensorDataset.
    """
    data = read_corpus_tri_cls(file_path,
                               max_length=config.max_length,
                               vocab=vocab)
    ids = torch.LongTensor([temp[0] for temp in data])
    masks = torch.LongTensor([temp[1] for temp in data])
    types = torch.LongTensor([temp[2] for temp in data])
    tags = torch.LongTensor([temp[3] for temp in data])
    dataset = TensorDataset(ids, masks, types, tags)
    return DataLoader(dataset, shuffle=shuffle, batch_size=config.batch_size)


def train():
    """Train the trigger-classification model.

    Loads config, vocabularies and the train/dev/test corpora, optionally
    restores a saved model, trains for ``config.base_epoch`` epochs,
    checkpoints whenever the dev loss improves, and finally evaluates on
    the test set.
    """
    # load config
    config = Config()
    print('settings:\n', config)
    # load corpus
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_cls_label_file)
    # load train and dev and test dataset (deduplicated via helper)
    train_loader = _make_tri_cls_loader(config.tri_cls_train_file, vocab,
                                        config, shuffle=True)
    dev_loader = _make_tri_cls_loader(config.tri_cls_dev_file, vocab,
                                      config, shuffle=True)
    test_loader = _make_tri_cls_loader(config.tri_cls_test_file, vocab,
                                       config, shuffle=False)
    # init model
    model = BertQA(config.bert_path, 2)
    if config.load_model:
        # BUG FIX: the original asserted config.load_path even though the
        # path actually used below is config.load_tri_cls_path.
        assert config.load_tri_cls_path is not None
        model = load_model(model, name=config.load_tri_cls_path)
    if config.use_cuda and torch.cuda.is_available():
        model.cuda()
    # train model
    print('begin training')
    model.train()
    optimizer_cls = getattr(optim, config.optim)
    optimizer = optimizer_cls(model.parameters(),
                              lr=config.lr,
                              weight_decay=config.weight_decay)
    best_dev_loss = 10000
    for epoch in tqdm.tqdm(range(config.base_epoch)):
        bar = tqdm.tqdm(enumerate(train_loader))
        for i, batch in bar:
            model.zero_grad()
            # Variable() wrappers removed: since torch 0.4 they are no-ops.
            inputs, masks, type_masks, label = batch
            masks = masks.bool()
            if config.use_cuda and torch.cuda.is_available():
                inputs, masks, type_masks, label = (inputs.cuda(),
                                                    masks.cuda(),
                                                    type_masks.cuda(),
                                                    label.cuda())
            feats = model(inputs, masks, type_masks)
            loss = model.loss(feats, label)
            loss.backward()
            optimizer.step()
            # Idiomatic tqdm: call set_description on the bar instance.
            bar.set_description("loss: %f" % loss.item())
        # save best model (lowest dev loss so far)
        dev_loss_temp = evaluate(model, dev_loader, epoch, config)
        if dev_loss_temp < best_dev_loss:
            print('dev loss: ', best_dev_loss, ' -> ', dev_loss_temp)
            best_dev_loss = dev_loss_temp
            save_model(model, epoch, name='tri-cls--epoch:{}'.format(epoch))
    # final evaluation uses the last-epoch model
    evaluate(model, test_loader, epoch, config)
示例#7
0
def predict(config=None, model=None, sent=None):
    """Classify the event type of every identified trigger.

    Input: results of trigger identification saved in
        config.tri_id_result_file, or a single sentence.
    Output: results of trigger classification (also written to
        config.tri_cls_result_file)
        format: [(event type, trigger begin pos, trigger end pos)
                 * num of triggers] * num of sentences

    Args:
        config: optional Config; a fresh one is created when omitted.
        model: optional pre-loaded BertQA model; restored from
            config.load_tri_cls_path when omitted.
        sent: optional single sentence (used only when config.input_file
            is unset).
    """
    # load config
    if not config:
        config = Config()
    # load corpus
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_cls_label_file)
    # load trained model
    if not model:
        model = BertQA(config.bert_path, 2)
        model = load_model(model, name=config.load_tri_cls_path)
    if config.use_cuda:
        model.cuda()
    # Single-sentence mode only when no input file is configured AND a
    # sentence was passed in; otherwise read the trigger-id result file.
    if (not config.input_file) and sent:
        test_datas = read_corpus_tr_id(config.tri_id_result_file,
                                       max_length=config.max_length,
                                       label_dic=load_vocab(
                                           config.tri_id_label_file),
                                       vocab=vocab,
                                       content=[sent])
    else:
        # load trigger identification result
        test_datas = read_corpus_tr_id(config.tri_id_result_file,
                                       max_length=config.max_length,
                                       label_dic=load_vocab(
                                           config.tri_id_label_file),
                                       vocab=vocab)
    sent_saves = []
    for i, test_data in tqdm.tqdm(enumerate(test_datas)):
        sent_save = []  # save sentence's triggers
        # NOTE: the `sent` parameter is intentionally overwritten per sentence.
        sent = test_data[3]
        triggers = test_data[4]
        # predict event type for each trigger
        for trigger in triggers:
            inputs, masks, type_masks = [], [], []
            # data preprocess, [CLS] trigger, [unused0] begin_pos end_pos [unused1], event type [SEP] sentence [SEP]
            for event_type in label_dic.keys():
                tokens_a = []
                for w in sent[trigger['begin_pos']:trigger['end_pos'] + 1]:
                    tokens_a.append(w.lower())
                tokens_a.extend([
                    ',', '[unused0]',
                    str(trigger['begin_pos']),
                    str(trigger['end_pos']), '[unused1]', ','
                ])
                for w in re.split('([:-])', event_type):
                    tokens_a.append(w.lower())
                tokens_b = sent
                # Truncate the sentence so the full sequence fits max_length.
                if len(tokens_a) + len(tokens_b) > config.max_length - 3:
                    tokens_b = tokens_b[0:(config.max_length - 3 -
                                           len(tokens_a))]
                tokens_f = ['[CLS]'] + tokens_a + ['[SEP]'
                                                   ] + tokens_b + ['[SEP]']
                # FIX: dedicated loop variable instead of shadowing the
                # outer enumerate index `i`.
                input_ids = [
                    int(vocab[tok]) if tok in vocab else int(vocab['[UNK]'])
                    for tok in tokens_f
                ]
                input_mask = [1] * len(input_ids)
                type_mask = [0] * (2 + len(tokens_a)) + [1] * (
                    config.max_length - 2 - len(tokens_a))
                # Zero-pad ids and mask up to max_length.
                while len(input_ids) < config.max_length:
                    input_ids.append(0)
                    input_mask.append(0)
                inputs.append(input_ids)
                masks.append(input_mask)
                type_masks.append(type_mask)
            inputs, masks, type_masks = Variable(torch.LongTensor(inputs)), \
                                        Variable(torch.LongTensor(masks)), \
                                        Variable(torch.LongTensor(type_masks))
            masks = masks.bool()
            if config.use_cuda and torch.cuda.is_available():
                inputs, masks, type_masks = inputs.cuda(), masks.cuda(
                ), type_masks.cuda()
            # predict event type: score all candidate event types in two
            # half-batches to bound memory use
            with torch.no_grad():
                feats_1 = model(inputs[:config.batch_size],
                                masks[:config.batch_size],
                                type_masks[:config.batch_size])
                feats_2 = model(inputs[config.batch_size:],
                                masks[config.batch_size:],
                                type_masks[config.batch_size:])
                feats = torch.cat([feats_1, feats_2])
            # BUG FIX: pass dim explicitly — the implicit softmax dim is
            # deprecated; dim=1 reproduces the previous implicit behaviour
            # for a 2-D input.
            tag_score = torch.nn.functional.softmax(feats, dim=1)
            # Pick the event type whose positive-class (column 1) score is
            # highest across all candidates.
            pred_label = torch.argmax(tag_score,
                                      dim=0).cpu().numpy().tolist()[1]
            pred_label = list(label_dic.keys())[int(pred_label)]
            # save event type for trigger
            sent_save.append(
                (pred_label, trigger['begin_pos'], trigger['end_pos']))
        sent_saves.append(sent_save)
    # save result
    with open(config.tri_cls_result_file, 'w', encoding='utf-8') as f:
        for sent_save in sent_saves:
            for trigger in sent_save:
                event_type = trigger[0]
                begin_pos = trigger[1]
                end_pos = trigger[2]
                f.write(event_type + ' ' + str(begin_pos) + ' ' +
                        str(end_pos) + ', ')
            f.write('\n')
    # evaluate
    if config.gold_trigger_file:
        evaluate_cls(config.gold_trigger_file)
    return sent_saves