示例#1
0
 def predict_dep(self, batch, unsort=True):
     """Run dependency parsing on one batch.

     Returns, per sentence, a list of [head, deprel] pairs (one per word,
     root token excluded). When ``unsort`` is True the sentences are
     restored to their original (pre-sorting) order.
     """
     inputs, num_words, orig_idx, sentlens = unpack_batch(batch, self.use_cuda, type="dep")
     tokens_phobert, first_subword, words_mask, gold_head, gold_deprel = inputs
     self.model.eval()
     n_sentences = tokens_phobert.size(0)
     _, preds = self.model.dep_forward(
         tokens_phobert,
         first_subword,
         words_mask,
         num_words,
         sentlens,
         gold_head,
         gold_deprel,
         eval=True,
     )
     # Decode head indices with Chu-Liu/Edmonds MST; [1:] drops the root's
     # self-attachment so only real words remain.
     head_seqs = []
     for adj, length in zip(preds[0], sentlens):
         head_seqs.append(chuliu_edmonds_one_root(adj[:length, :length])[1:])
     # Look up the relation label scored for each (word, chosen head) pair;
     # word positions are offset by 1 because index 0 is the root.
     deprel_seqs = []
     for sent_idx, heads in enumerate(head_seqs):
         label_ids = [preds[1][sent_idx][word_idx + 1][h] for word_idx, h in enumerate(heads)]
         deprel_seqs.append(self.vocab["deprel"].unmap(label_ids))
     pred_tokens = []
     for sent_idx in range(n_sentences):
         sent = []
         for word_idx in range(sentlens[sent_idx] - 1):
             sent.append([str(head_seqs[sent_idx][word_idx]), deprel_seqs[sent_idx][word_idx]])
         pred_tokens.append(sent)
     if unsort:
         pred_tokens = util.unsort(pred_tokens, orig_idx)
     return pred_tokens
示例#2
0
    def predict_pos(self, batch, unsort=True):
        """Predict UPOS tags for one batch.

        Returns, per sentence, a list of single-element ``[upos]`` lists
        (one per word). When ``unsort`` is True the sentences are restored
        to their original (pre-sorting) order.
        """
        inputs, orig_idx, sentlens = unpack_batch(batch, self.use_cuda, type="pos")
        tokens_phobert, first_subword, gold_upos = inputs
        self.model.eval()
        n_sentences = tokens_phobert.size(0)
        _, preds = self.model.pos_forward(tokens_phobert, first_subword, sentlens, False, gold_upos)
        upos_seqs = [self.vocab["upos"].unmap(sent) for sent in preds[0].tolist()]

        pred_tokens = []
        for sent_idx in range(n_sentences):
            # One [upos] entry per real word; padding beyond sentlens is dropped.
            pred_tokens.append([[upos_seqs[sent_idx][word_idx]] for word_idx in range(sentlens[sent_idx])])
        if unsort:
            pred_tokens = util.unsort(pred_tokens, orig_idx)
        return pred_tokens
示例#3
0
    def predict_ner(self, batch, unsort=True):
        """Predict NER tag sequences for one batch via Viterbi decoding.

        Returns one list of tag strings per sentence. When ``unsort`` is
        True the sentences are restored to their original order.
        """
        inputs, orig_idx, sentlens = unpack_batch(batch, self.use_cuda, type="ner")
        tokens_phobert, first_subword, word_mask, gold_tags = inputs

        self.model.eval()
        _, logits = self.model.ner_forward(
            tokens_phobert, first_subword, word_mask, sentlens, gold_tags
        )

        # Viterbi-decode each sentence using the CRF's learned transition matrix.
        transitions = self.model.crit_ner._transitions.data.cpu().numpy()
        emissions = logits.data.cpu().numpy()
        tag_seqs = []
        for sent_idx in range(logits.size(0)):
            best_path, _ = viterbi_decode(emissions[sent_idx, : sentlens[sent_idx]], transitions)
            tag_seqs.append(self.vocab["ner_tag"].unmap(best_path))
        if unsort:
            tag_seqs = util.unsort(tag_seqs, orig_idx)
        return tag_seqs
示例#4
0
def evaluate(args):
    """Evaluate a saved joint model on the POS and NER test sets.

    Loads the checkpoint from ``args["save_dir"]/phonlp.pt``, rebuilds the
    tokenizer and PhoBERT config it was trained with, and prints POS accuracy
    and NER F1. Dependency evaluation is currently disabled (see the
    commented-out block below); its file paths are kept for when it is
    re-enabled.
    """
    # File paths (dep paths are only consumed by the disabled dep evaluation).
    system_pred_file = args["output_file_dep"]
    gold_file = args["eval_file_dep"]
    model_file = args["save_dir"] + "/" + "phonlp.pt"

    # Load checkpoint on CPU regardless of where it was trained.
    checkpoint = torch.load(model_file, lambda storage, loc: storage)
    loaded_args = checkpoint["config"]
    vocab = MultiVocab.load_state_dict(checkpoint["vocab"])
    config_phobert = AutoConfig.from_pretrained(
        loaded_args["pretrained_lm"], output_hidden_states=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        loaded_args["pretrained_lm"], use_fast=False
    )

    # Restore the trained joint model.
    print("Loading model from: {}".format(model_file))
    use_cuda = args["cuda"] and not args["cpu"]
    trainer = JointTrainer(
        model_file=model_file, use_cuda=use_cuda, config_phobert=config_phobert
    )

    # Build evaluation data loaders for all three tasks.
    print("Loading data with batch size {}...".format(args["batch_size"]))
    test_doc_dep = Document(CoNLL.conll2dict(input_file=args["eval_file_dep"]))
    test_batch_pos = DataLoaderPOS(
        args["eval_file_pos"],
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    test_batch_dep = DataLoaderDep(
        test_doc_dep,
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    test_batch_ner = DataLoaderNER(
        args["eval_file_ner"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )

    print("Start evaluation...")
    # Dependency evaluation, currently disabled:
    # test_preds_dep = []
    # for batch in test_batch_dep:
    #     preds_dep = trainer.predict_dep(batch)
    #     test_preds_dep += preds_dep
    # test_preds_dep = util.unsort(test_preds_dep, test_batch_dep.data_orig_idx_dep)
    # test_batch_dep.doc_dep.set([HEAD, DEPREL], [y for x in test_preds_dep for y in x])
    # CoNLL.dict2conll(test_batch_dep.doc_dep.to_dict(), system_pred_file)
    # _, _, las, uas = score_dep.score(system_pred_file, gold_file)

    # POS: collect predictions across batches, undo the eval-time sort, score.
    test_preds_upos = []
    for batch in test_batch_pos:
        test_preds_upos += trainer.predict_pos(batch)
    test_preds_upos = util.unsort(test_preds_upos, test_batch_pos.data_orig_idx_pos)
    accuracy_pos = score_pos.score_acc(test_preds_upos, test_batch_pos.upos)

    # NER: collect predictions and score by entity.
    test_preds_ner = []
    for batch in test_batch_ner:
        test_preds_ner += trainer.predict_ner(batch)
    p, r, f1 = score_ner.score_by_entity(test_preds_ner, test_batch_ner.tags)

    # print(
    #     "{} POS tagging: {:.2f}, NER: {:.2f}, Dependency parsing: {:.2f}/{:.2f}".format(
    #         "Evaluation results: ", accuracy_pos * 100, f1 * 100, las * 100, uas * 100
    #     )
    # )
    print("{} POS tagging: {:.2f}, NER: {:.2f}".format("Evaluation results: ",
                                                       accuracy_pos * 100,
                                                       f1 * 100))
示例#5
0
def _evaluate_dev(trainer, dev_batch_dep, dev_batch_pos, dev_batch_ner,
                  dev_gold_tags, system_pred_file, gold_file):
    """Run the full dev-set evaluation for all three tasks.

    Returns (las_dev, uas_dev, accuracy_pos_dev, ner_p, ner_r, ner_f1).
    Side effect: writes the dep predictions to ``system_pred_file`` so that
    ``score_dep.score`` can compare against ``gold_file``.
    """
    print("Evaluating on dev set...")

    # Dependency: predict, undo eval-time sort, write CoNLL, score LAS/UAS.
    dev_preds_dep = []
    for batch in dev_batch_dep:
        dev_preds_dep += trainer.predict_dep(batch)
    dev_preds_dep = util.unsort(dev_preds_dep, dev_batch_dep.data_orig_idx_dep)
    dev_batch_dep.doc_dep.set([HEAD, DEPREL],
                              [y for x in dev_preds_dep for y in x])
    CoNLL.dict2conll(dev_batch_dep.doc_dep.to_dict(), system_pred_file)
    _, _, las_dev, uas_dev = score_dep.score(system_pred_file, gold_file)

    # POS: predict, undo eval-time sort, score accuracy.
    dev_preds_upos = []
    for batch in dev_batch_pos:
        dev_preds_upos += trainer.predict_pos(batch)
    dev_preds_upos = util.unsort(dev_preds_upos,
                                 dev_batch_pos.data_orig_idx_pos)
    accuracy_pos_dev = score_pos.score_acc(dev_preds_upos, dev_batch_pos.upos)

    # NER: predict and score by entity; sanity-check sequence lengths.
    dev_preds_ner = []
    for batch in dev_batch_ner:
        dev_preds_ner += trainer.predict_ner(batch)
    p, r, f1 = score_ner.score_by_entity(dev_preds_ner, dev_gold_tags)
    for idx in range(len(dev_batch_ner)):
        assert len(dev_preds_ner[idx]) == len(dev_gold_tags[idx])

    return las_dev, uas_dev, accuracy_pos_dev, p, r, f1


def train(args):
    """Train the joint POS + NER + dependency-parsing model.

    Builds the vocabulary and data loaders from the training files in
    ``args``, fine-tunes PhoBERT jointly on the three tasks with gradient
    accumulation, evaluates on the dev sets every ``args["eval_interval"]``
    steps and at each epoch end, and saves the best checkpoint (ranked by
    dev LAS + POS accuracy + NER F1) to ``args["save_dir"]/phonlp.pt``.
    """
    util.ensure_dir(args["save_dir"])
    model_file = args["save_dir"] + "/" + "phonlp.pt"

    tokenizer = AutoTokenizer.from_pretrained(args["pretrained_lm"],
                                              use_fast=False)
    config_phobert = AutoConfig.from_pretrained(args["pretrained_lm"],
                                                output_hidden_states=True)

    print("Loading data with batch size {}...".format(args["batch_size"]))
    train_doc_dep = Document(
        CoNLL.conll2dict(input_file=args["train_file_dep"]))
    vocab = BuildVocab(args, args["train_file_pos"], train_doc_dep,
                       args["train_file_ner"]).vocab

    train_batch_pos = DataLoaderPOS(
        args["train_file_pos"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=False,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    train_batch_dep = DataLoaderDep(
        train_doc_dep,
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=False,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    train_batch_ner = DataLoaderNER(
        args["train_file_ner"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=False,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )

    dev_doc_dep = Document(CoNLL.conll2dict(input_file=args["eval_file_dep"]))

    dev_batch_pos = DataLoaderPOS(
        args["eval_file_pos"],
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    dev_batch_dep = DataLoaderDep(
        dev_doc_dep,
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    dev_batch_ner = DataLoaderNER(
        args["eval_file_ner"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )

    # Prediction and gold paths used by the dep dev evaluation.
    system_pred_file = args["output_file_dep"]
    gold_file = args["eval_file_dep"]

    dev_gold_tags = dev_batch_ner.tags

    # Skip training if the language does not have training or dev data.
    if len(train_batch_pos) == 0 or len(dev_batch_pos) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training jointmodel...")
    trainer = JointTrainer(args, vocab, None, config_phobert, args["cuda"])
    # Ensure every PhoBERT parameter is trainable (full fine-tuning).
    tsfm = trainer.model.phobert
    for child in tsfm.children():
        for param in child.parameters():
            if not param.requires_grad:
                print("whoopsies")
            param.requires_grad = True

    global_step = 0
    las_score_history = 0
    uas_score_history = 0
    upos_score_history = 0
    f1_score_history = 0

    # Running loss accumulators, reset at each epoch end.
    # NOTE(review): trainer.update returns only (loss, loss_pos, loss_ner),
    # so train_loss_dep is never accumulated and always prints as 0.
    train_loss = 0
    train_loss_pos = 0
    train_loss_dep = 0
    train_loss_ner = 0

    # Optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(trainer.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    num_train_optimization_steps = int(
        args["num_epoch"] * len(train_batch_pos) / args["accumulation_steps"])
    optimizer = AdamW(
        optimizer_grouped_parameters, lr=args["lr"], correct_bias=False
    )  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=5,
        num_training_steps=num_train_optimization_steps)
    # NOTE(review): the returned schedule is discarded, but constructing it
    # may touch the optimizer's LR state, so it is kept for behavior parity.
    # Consider removing after verifying it has no effect — TODO confirm.
    get_constant_schedule(optimizer)

    for epoch in range(args["num_epoch"]):
        optimizer.zero_grad()
        print(" EPOCH  : ", epoch)
        step = 0
        lambda_pos = args["lambda_pos"]
        lambda_ner = args["lambda_ner"]
        lambda_dep = args["lambda_dep"]

        # One "epoch" iterates as many steps as the largest task loader;
        # smaller loaders are reshuffled each time a full pass completes.
        # Assumes each loader's __getitem__ tolerates i >= len (wraps) —
        # TODO confirm against the DataLoader implementations.
        epoch_size = max(
            [len(train_batch_pos),
             len(train_batch_dep),
             len(train_batch_ner)])
        for i in tqdm(range(epoch_size)):
            step += 1
            global_step += 1
            batch_pos = train_batch_pos[i]
            batch_dep = train_batch_dep[i]
            batch_ner = train_batch_ner[i]
            loss, loss_pos, loss_ner = trainer.update(
                batch_dep,
                batch_pos,
                batch_ner,
                lambda_pos=lambda_pos,
                lambda_dep=lambda_dep,
                lambda_ner=lambda_ner)  # accumulates gradients
            train_loss += loss
            train_loss_pos += loss_pos
            train_loss_ner += loss_ner

            # Apply gradients every accumulation_steps mini-batches.
            if i % args["accumulation_steps"] == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

            # Reshuffle the two shorter loaders whenever a full pass over
            # each of them completes.
            if epoch_size == len(train_batch_pos):
                if step % len(train_batch_dep) == 0:
                    train_batch_dep.reshuffle()
                if step % len(train_batch_ner) == 0:
                    train_batch_ner.reshuffle()
            elif epoch_size == len(train_batch_ner):
                if step % len(train_batch_dep) == 0:
                    train_batch_dep.reshuffle()
                if step % len(train_batch_pos) == 0:
                    train_batch_pos.reshuffle()
            elif epoch_size == len(train_batch_dep):
                if step % len(train_batch_pos) == 0:
                    # BUG FIX: this used to reshuffle train_batch_dep,
                    # leaving the exhausted POS loader un-reshuffled.
                    train_batch_pos.reshuffle()
                if step % len(train_batch_ner) == 0:
                    train_batch_ner.reshuffle()

            # Periodic in-epoch dev evaluation and checkpointing.
            if step % args["eval_interval"] == 0:
                las_dev, uas_dev, accuracy_pos_dev, p, r, f1 = _evaluate_dev(
                    trainer, dev_batch_dep, dev_batch_pos, dev_batch_ner,
                    dev_gold_tags, system_pred_file, gold_file)

                print(
                    "step {}: dev_las_score = {:.4f}, dev_uas_score = {:.4f}, dev_pos = {:.4f}, dev_ner_p = {:.4f}, dev_ner_r = {:.4f}, dev_ner_f1 = {:.4f}"
                    .format(global_step, las_dev, uas_dev, accuracy_pos_dev, p,
                            r, f1))

                # Save best model (combined LAS + POS accuracy + NER F1).
                if las_dev + accuracy_pos_dev + f1 >= (las_score_history +
                                                       upos_score_history +
                                                       f1_score_history):
                    las_score_history = las_dev
                    upos_score_history = accuracy_pos_dev
                    uas_score_history = uas_dev
                    f1_score_history = f1
                    trainer.save(model_file)
                    print("new best model saved.")
                print("")

        # End-of-epoch dev evaluation.
        las_dev, uas_dev, accuracy_pos_dev, p, r, f1 = _evaluate_dev(
            trainer, dev_batch_dep, dev_batch_pos, dev_batch_ner,
            dev_gold_tags, system_pred_file, gold_file)

        # Average losses per POS batch for reporting.
        train_loss = train_loss / len(train_batch_pos)
        train_loss_dep = train_loss_dep / len(train_batch_pos)
        train_loss_pos = train_loss_pos / len(train_batch_pos)
        train_loss_ner = train_loss_ner / len(train_batch_pos)

        print(
            "step {}: train_loss = {:.6f}, train_loss_dep = {:.6f}, train_loss_pos = {:.6f}, train_loss_ner = {:.6f}, dev_las_score = {:.4f}, dev_uas_score = {:.4f}, dev_pos = {:.4f}, dev_ner_p = {:.4f}, dev_ner_r = {:.4f}, dev_ner_f1 = {:.4f} "
            .format(
                global_step,
                train_loss,
                train_loss_dep,
                train_loss_pos,
                train_loss_ner,
                las_dev,
                uas_dev,
                accuracy_pos_dev,
                p,
                r,
                f1,
            ))

        # Save best model (same criterion as the in-epoch check).
        if las_dev + accuracy_pos_dev + f1 >= (
                las_score_history + upos_score_history + f1_score_history):
            las_score_history = las_dev
            upos_score_history = accuracy_pos_dev
            uas_score_history = uas_dev
            f1_score_history = f1
            trainer.save(model_file)
            print("new best model saved.")

        # Reset loss accumulators and reshuffle all loaders for next epoch.
        train_loss = 0
        train_loss_pos = 0
        train_loss_dep = 0
        train_loss_ner = 0

        print("")
        train_batch_dep.reshuffle()
        train_batch_pos.reshuffle()
        train_batch_ner.reshuffle()

    print("Training ended with {} epochs.".format(epoch))

    best_las, uas, upos, f1 = (
        las_score_history * 100,
        uas_score_history * 100,
        upos_score_history * 100,
        f1_score_history * 100,
    )
    print("Best dev las = {:.2f}, uas = {:.2f}, upos = {:.2f}, f1 = {:.2f}".
          format(best_las, uas, upos, f1))
示例#6
0
    def annotate(self,
                 text=None,
                 input_file=None,
                 output_file=None,
                 batch_size=1,
                 output_type=''):
        """Annotate word-segmented text with POS, NER and dependency labels.

        Args:
            text: a single pre-tokenized sentence (words separated by spaces).
                When given, ``input_file`` is ignored and results are returned.
            input_file: path to a file with one space-separated sentence per
                non-empty line.
            output_file: destination path when reading from ``input_file``.
            batch_size: number of sentences per forward pass.
            output_type: ``'conll'`` for 10-column CoNLL-style output,
                anything else for a compact tab-separated format.

        Returns:
            ``(data, pos_preds, ner_preds, dep_preds)`` when ``text`` is
            given; otherwise writes to ``output_file`` and returns None.
        """
        if text is not None:
            data = [text.split(' ')]
        else:
            # FIX: use a context manager so the input file is closed even if
            # reading fails (the original left it open on error).
            data = []
            with open(input_file) as fin:
                for line in fin:
                    line = line.strip()
                    if len(line) != 0:
                        data.append(line.split(' '))
            print("The number of sentences: ", len(data))
        data_tagger = self.process_data_tagger(batch_text=data)
        data_parser = self.process_data_parser(batch_text=data)
        data_parser = self.chunk_batches(data_parser, batch_size)
        data_tagger = self.chunk_batches(data_tagger, batch_size)
        test_preds_pos = []
        test_preds_dep = []
        test_preds_ner = []
        for i in tqdm(range(len(data_tagger))):
            tokens_phobert, first_subword, words_mask, number_of_words, orig_idx, sentlens = self.get_batch(
                i, data_tagger)
            tokens_phobert1, first_subword1, words_mask1, number_of_words1, orig_idx1, sentlens1 = self.get_batch(
                i, data_parser)
            if torch.cuda.is_available():
                tokens_phobert, first_subword, words_mask = tokens_phobert.cuda(
                ), first_subword.cuda(), words_mask.cuda()
                tokens_phobert1, first_subword1, words_mask1 = tokens_phobert1.cuda(
                ), first_subword1.cuda(), words_mask1.cuda()

            preds_dep = self.dep_forward(tokens_phobert1, first_subword1,
                                         sentlens1)
            preds_pos, logits = self.tagger_forward(tokens_phobert,
                                                    first_subword, sentlens)
            # FIX: do not clobber the batch_size parameter; the last chunk may
            # be smaller than the requested batch size.
            n_sents = tokens_phobert.size(0)

            # Dependency: MST decode per sentence; [1:] drops the root token.
            head_seqs = [
                chuliu_edmonds_one_root(adj[:l, :l])[1:]
                for adj, l in zip(preds_dep[0], sentlens1)
            ]
            deprel_seqs = [
                self.vocab['deprel'].unmap(
                    [preds_dep[1][si][sj + 1][h] for sj, h in enumerate(hs)])
                for si, hs in enumerate(head_seqs)
            ]
            pred_tokens = [[[str(head_seqs[si][sj]), deprel_seqs[si][sj]]
                            for sj in range(sentlens1[si] - 1)]
                           for si in range(n_sents)]
            pred_tokens_dep = util.unsort(pred_tokens, orig_idx1)

            # POS: one [upos] entry per word.
            upos_seqs = [
                self.vocab['upos'].unmap(sent) for sent in preds_pos[0]
            ]
            pred_tokens_pos = [[[upos_seqs[si][sj]]
                                for sj in range(sentlens[si])]
                               for si in range(n_sents)]
            pred_tokens_pos = util.unsort(pred_tokens_pos, orig_idx)

            # NER: Viterbi decode with the CRF transition matrix.
            # FIX: use a distinct index variable instead of reusing the outer
            # loop's `i`.
            trans = self.crit_ner._transitions.data.cpu().numpy()
            scores = logits.data.cpu().numpy()
            tag_seqs = []
            for si in range(logits.size(0)):
                tags, _ = viterbi_decode(scores[si, :sentlens[si]], trans)
                tags = self.vocab['ner_tag'].unmap(tags)
                tag_seqs += [tags]
            tag_seqs = util.unsort(tag_seqs, orig_idx)
            test_preds_ner += tag_seqs
            test_preds_dep += pred_tokens_dep
            test_preds_pos += pred_tokens_pos
        # Undo the global (cross-batch) length sort applied during chunking.
        test_preds_dep = util.unsort(test_preds_dep, self.data_orig_idx)
        test_preds_pos = util.unsort(test_preds_pos, self.data_orig_idx)
        test_preds_ner = util.unsort(test_preds_ner, self.data_orig_idx)
        if text is not None:
            return (data, test_preds_pos, test_preds_ner, test_preds_dep)
        else:
            # FIX: context manager ensures the output file is flushed/closed
            # even if formatting raises.
            with open(output_file, 'w') as f:
                for i in range(len(data)):
                    for j in range(len(data[i])):
                        if output_type == 'conll':
                            f.write(
                                str(j + 1) + '\t' + data[i][j] + '\t' + '_' +
                                '\t' + '_' + '\t' + test_preds_pos[i][j][0] +
                                '\t' + '_' + '\t' + test_preds_dep[i][j][0] +
                                '\t' + test_preds_dep[i][j][1] + '\t' + '_' +
                                '\t' + test_preds_ner[i][j] + '\n')
                        else:
                            f.write(
                                str(j + 1) + '\t' + data[i][j] + '\t' +
                                test_preds_pos[i][j][0] + '\t' +
                                test_preds_ner[i][j] + '\t' +
                                test_preds_dep[i][j][0] + '\t' +
                                test_preds_dep[i][j][1] + '\n')
                    f.write('\n')