Example #1
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
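For reference, the snippet below is a minimal sketch of how the args object consumed by do_eval could be built from the command line. The argument names mirror the attributes the function reads (device, model_name_or_path, max_seq_length, batch_size, init_checkpoint_path); the defaults shown here are assumptions and are not taken from the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="gpu",
                        help="Device to run on, e.g. 'gpu' or 'cpu'.")
    parser.add_argument("--model_name_or_path", default="bert-base-chinese",
                        help="Pretrained model name or local directory (assumed default).")
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--init_checkpoint_path", default=None,
                        help="Optional path to a fine-tuned .pdparams checkpoint.")
    args = parser.parse_args()
    do_eval(args)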
Example #2
def main(args):
    paddle.set_device('gpu' if args.n_gpu else 'cpu')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1 and args.do_train:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(dataset_class.convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_len)
    test_trans_func = partial(dataset_class.convert_example,
                              tokenizer=tokenizer,
                              max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    if args.task_name in ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda'):
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype='int64')  # label
        ): fn(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64')  # label
        ): fn(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)
    if world_size > 1 and args.do_train:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class, trans_func,
                                               batchify_fn, 'train')
        if args.do_eval:
            dev_data_loader = create_data_loader(args, dataset_class,
                                                 test_trans_func, batchify_fn,
                                                 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric, rank)

    if args.do_test:
        if rank == 0:
            test_data_loader = create_data_loader(args, dataset_class,
                                                  test_trans_func, batchify_fn,
                                                  'test')
            if args.do_train:
                # If do_eval is True, use the best model to evaluate the test
                # data; otherwise, use the final model.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(args.output_dir, 'best')
                    load_ckpt(args, model)
            else:
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)
Example #3
    def run(self):
        args = self.args
        paddle.set_device(args.device)
        rank = paddle.distributed.get_rank()
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.init_parallel_env()

        # Read the label map, i.e. the ids of the B/I/O tags.
        label_map_path = os.path.join(args.data_path, "predicate2id.json")
        if not (os.path.exists(label_map_path)
                and os.path.isfile(label_map_path)):
            sys.exit(
                "{} dose not exists or is not a file.".format(label_map_path))
        with open(label_map_path, 'r', encoding='utf8') as fp:
            label_map = json.load(fp)  # dict
        # Each B tag is doubled to distinguish subject from object: B * 2 + 2.
        num_classes = (len(label_map.keys()) - 2) * 2 + 2

        # Loads pretrained model BERT
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path, num_classes=num_classes)
        model = paddle.DataParallel(model)
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
        criterion = BCELossForDuIE()

        # Loads dataset.
        train_dataset = DuIEDataset.from_file(
            os.path.join(args.data_path, 'train_data.json'), tokenizer,
            args.max_seq_length, True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=True)
        collator = DataCollator()
        train_data_loader = DataLoader(dataset=train_dataset,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=collator,
                                       return_list=True)

        eval_file_path = os.path.join(args.data_path, 'dev_data.json')
        test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                             args.max_seq_length, True)
        test_batch_sampler = paddle.io.BatchSampler(test_dataset,
                                                    batch_size=args.batch_size,
                                                    shuffle=False,
                                                    drop_last=True)
        test_data_loader = DataLoader(dataset=test_dataset,
                                      batch_sampler=test_batch_sampler,
                                      collate_fn=collator,
                                      return_list=True)

        # Defines learning rate strategy.
        steps_by_epoch = len(train_data_loader)
        num_training_steps = steps_by_epoch * args.num_train_epochs
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_ratio)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)

        # Starts training.
        global_step = 0
        logging_steps = 50
        save_steps = 10000
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            print("\n=====start training of %d epochs=====" % epoch)
            tic_epoch = time.time()
            model.train()
            for step, batch in enumerate(train_data_loader):
                input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
                logits = model(input_ids=input_ids)
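                # Exclude token ids 0, 1 and 2 (presumably padding and other
                # special tokens in this vocabulary) from the loss computed below.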
                mask = (input_ids != 0).logical_and(
                    (input_ids != 1)).logical_and((input_ids != 2))
                loss = criterion(logits, labels, mask)
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                loss_item = loss.numpy().item()
                global_step += 1

                if global_step % logging_steps == 0 and rank == 0:
                    print(
                        "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                        % (epoch, args.num_train_epochs, step, steps_by_epoch,
                           loss_item, logging_steps /
                           (time.time() - tic_train)))
                    tic_train = time.time()

                if global_step % save_steps == 0 and rank == 0:
                    print("\n=====start evaluating ckpt of %d steps=====" %
                          global_step)
                    precision, recall, f1 = self.evaluate(
                        model, criterion, test_data_loader, eval_file_path,
                        "eval")
                    print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                          (100 * precision, 100 * recall, 100 * f1))
                    print("saving checkpoing model_%d.pdparams to %s " %
                          (global_step, args.output_dir))
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
                    model.train()  # back to train mode

            tic_epoch = time.time() - tic_epoch
            print("epoch time footprint: %d hour %d min %d sec" %
                  (tic_epoch // 3600,
                   (tic_epoch % 3600) // 60, tic_epoch % 60))

        # Does final evaluation.
        if rank == 0:
            print("\n=====start evaluating last ckpt of %d steps=====" %
                  global_step)
            precision, recall, f1 = self.evaluate(model, criterion,
                                                  test_data_loader,
                                                  eval_file_path, "eval")
            print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                  (100 * precision, 100 * recall, 100 * f1))
            paddle.save(
                model.state_dict(),
                os.path.join(args.output_dir,
                             "model_%d.pdparams" % global_step))
            print("\n=====training complete=====")
Example #4
def do_predict(args):
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_ds, predict_ds = load_dataset('msra_ner',
                                        splits=('train', 'test'),
                                        lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict(
        {
            'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id
                                  ),  # segment
            'seq_len': Stack(),
            'labels': Pad(axis=0, pad_val=ignore_label)  # label
        }): fn(samples)
    raw_data = predict_ds.data

    id2label = dict(enumerate(predict_ds.label_list))

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
Example #5
def do_predict(args):
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_dataset, predict_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "test"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=label_num - 1,
                         max_seq_length=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label)  # label
    ): fn(samples)
    raw_data = predict_dataset.data

    id2label = dict(enumerate(predict_dataset.get_labels()))

    predict_dataset = predict_dataset.apply(trans_func, lazy=True)
    predict_batch_sampler = paddle.io.BatchSampler(predict_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False,
                                                   drop_last=True)
    predict_data_loader = DataLoader(dataset=predict_dataset,
                                     batch_sampler=predict_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, segment_ids, length, labels = batch
        logits = model(input_ids, segment_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
Example #6
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict(
        {
            'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id
                                  ),  # segment
            'seq_len': Stack(),  # seq_len
            'labels': Pad(axis=0, pad_val=ignore_label)  # label
        }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    last_step = args.num_train_epochs * len(train_data_loader)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == last_step:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
Example #7
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "dev"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=label_num - 1,
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label)  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               drop_last=True)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_data_loader) * args.num_train_epochs): float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            input_ids, segment_ids, length, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0:
                evaluate(model, loss_fct, metric, dev_data_loader, label_num)
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            global_step += 1
Example #8
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    eval_ds = eval_ds.select(range(len(eval_ds) - 1))
    eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
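As a side note, the label-alignment arithmetic inside tokenize_and_align_labels truncates the word-level tags to fit between [CLS] and [SEP] and then pads them with no_entity_id up to the tokenized length. A tiny self-contained illustration with made-up numbers (no tokenizer involved):

# Toy illustration of the alignment step above; the numbers are hypothetical.
no_entity_id = 0
tokenized_len = 6            # e.g. [CLS] + 4 sub-word tokens + [SEP]
label_ids = [3, 4, 0, 0, 0]  # five word-level tags, one more than fits

# Truncate to the number of positions available between [CLS] and [SEP].
if tokenized_len - 2 < len(label_ids):
    label_ids = label_ids[:tokenized_len - 2]
# Surround with no_entity_id for [CLS]/[SEP] and pad to the full length.
label_ids = [no_entity_id] + label_ids + [no_entity_id]
label_ids += [no_entity_id] * (tokenized_len - len(label_ids))
print(label_ids)  # -> [0, 3, 4, 0, 0, 0]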
Example #9
def do_predict(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_examples, predict_examples = load_dataset('msra_ner',
                                                    split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_examples.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict(
        {
            'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id
                                  ),  # segment
            'seq_len': Stack(),
            'labels': Pad(axis=0, pad_val=ignore_label)  # label
        }): fn(samples)

    id2label = dict(enumerate(label_list))

    predict_examples = predict_examples.select(
        range(len(predict_examples) - 1))
    predict_ds = predict_examples.map(tokenize_and_align_labels, batched=True)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(predict_examples, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
Example #10
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, test_dataset = load_dataset(
        'msra_ner', splits=('train', 'test'), lazy=True)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(
        tokenize_and_align_labels,
        tokenizer=tokenizer,
        no_entity_id=no_entity_id,
        max_seq_len=args.max_seq_length)

    train_dataset = train_dataset.map(trans_func)

    train_dataset = train_dataset.shard()

    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        'segment_ids': Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    train_data_loader = DataLoader(
        dataset=train_dataset,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_size=args.batch_size,
        return_list=True)

    test_dataset = test_dataset.map(trans_func)

    test_data_loader = DataLoader(
        dataset=test_dataset,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_size=args.batch_size,
        return_list=True)

    model = BertForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = 2812

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, _, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(
                logits.reshape([-1, label_num]), labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(model.state_dict(),
                                os.path.join(args.output_dir,
                                             "model_%d.pdparams" % global_step))

    # Save the final model.
    if (global_step) % args.save_steps != 0:
        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            evaluate(model, loss_fct, metric, test_data_loader, label_num)
            paddle.save(model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))