示例#1
0
    def __init__(
        self,
        task: str = None,
        load_checkpoint: str = None,
        label_map: Dict = None,
        num_classes: int = 2,
        **kwargs,
    ):
        """Build the ``electra-small`` network for the requested task.

        Args:
            task: ``'seq-cls'``, ``'token-cls'``, ``'text-matching'`` or
                ``None`` (bare backbone, no criterion/metric). The legacy name
                ``'sequence_classification'`` is still accepted and mapped to
                ``'seq-cls'`` with a deprecation warning.
            load_checkpoint: Optional path to a saved state dict. It is loaded
                after the network is built, but only when it is an existing
                file; otherwise a warning is logged.
            label_map: Optional mapping whose values, ordered by sorted key,
                form the label list of the chunk evaluator. When non-empty its
                size overrides ``num_classes``. Required for ``'token-cls'``.
            num_classes: Number of classes used when ``label_map`` is empty.
            **kwargs: Forwarded unchanged to ``from_pretrained``.

        Raises:
            RuntimeError: If ``task`` is not supported, or if ``task`` is
                ``'token-cls'`` and no ``label_map`` was supplied.
        """
        super(Electra, self).__init__()
        if label_map:
            self.label_map = label_map
            self.num_classes = len(label_map)
        else:
            self.num_classes = num_classes

        if task == 'sequence_classification':
            task = 'seq-cls'
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed in the future.",
            )
        if task == 'seq-cls':
            self.model = ElectraForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path='electra-small',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task == 'token-cls':
            # self.label_map is only assigned for a non-empty label_map, and
            # the chunk evaluator below cannot be built without it. Fail
            # before loading any weights instead of hitting an AttributeError.
            if not label_map:
                raise RuntimeError(
                    "Task 'token-cls' requires a non-empty label_map.")
            self.model = ElectraForTokenClassification.from_pretrained(
                pretrained_model_name_or_path='electra-small',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = ChunkEvaluator(label_list=[
                self.label_map[i] for i in sorted(self.label_map.keys())
            ])
        elif task == 'text-matching':
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path='electra-small', **kwargs)
            self.dropout = paddle.nn.Dropout(0.1)
            # The classifier consumes 3 * hidden_size features and emits
            # 2 logits.
            self.classifier = paddle.nn.Linear(
                self.model.config['hidden_size'] * 3, 2)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task is None:
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path='electra-small', **kwargs)
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task

        if load_checkpoint is not None:
            if os.path.isfile(load_checkpoint):
                state_dict = paddle.load(load_checkpoint)
                self.set_state_dict(state_dict)
                logger.info('Loaded parameters from %s' %
                            os.path.abspath(load_checkpoint))
            else:
                # Previously ignored silently, leaving the caller unaware
                # that the pretrained weights were still in use.
                logger.warning(
                    'Checkpoint %s is not a file, parameters were not loaded.'
                    % load_checkpoint)
示例#2
0
def do_eval(args):
    """Evaluate a BERT token-classification model on the MSRA-NER test split.

    Reads ``device``, ``model_name_or_path``, ``max_seq_length``,
    ``batch_size`` and ``init_checkpoint_path`` from ``args`` and prints the
    loss of the last evaluated batch together with the chunk-level precision,
    recall and F1 accumulated over the whole split.
    """
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    # The train split is only needed for its label list.
    label_list = train_ds.label_list
    label_num = len(label_list)
    # The last label id is passed on as the "no entity" id.
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    # Label padding uses the same value the loss is told to ignore.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    # Default so the report below still works when the loader yields nothing.
    avg_loss = 0.0
    # Pure inference: no gradient bookkeeping is needed.
    with paddle.no_grad():
        for batch in eval_data_loader:
            input_ids, token_type_ids, length, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            # NOTE: only the last batch's loss survives the loop, as before.
            avg_loss = paddle.mean(loss)
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                length, preds, labels)
            metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                          num_correct_chunks.numpy())
    # The counters are cumulative, so one accumulate() after the loop yields
    # the same values as calling it after every batch.
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
示例#3
0
def evaluate(args):
    """Evaluate a BiGRU-CRF tagger checkpoint on ``<data_dir>/test.tsv``.

    Reads ``device``, ``data_dir``, ``max_seq_len``, ``batch_size``,
    ``emb_dim``, ``hidden_size`` and ``init_checkpoint`` from ``args`` and
    prints the chunk-level precision, recall and F1 over the test set.
    """
    paddle.set_device(args.device)

    # create dataset.
    test_ds = load_dataset(datafiles=(os.path.join(args.data_dir, 'test.tsv')))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC case to SBC case
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(
        convert_example,
        max_seq_len=args.max_seq_len,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        normlize_vocab=normlize_vocab)
    # NOTE(review): the return value is discarded, so this relies on map()
    # transforming the dataset in place — confirm against the dataset class.
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(
        dataset=test_ds,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_sampler=test_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))
    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)

    # Load the model and start predicting
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    chunk_evaluator.reset()
    # Pure inference: no gradient bookkeeping is needed.
    with paddle.no_grad():
        for batch in test_loader:
            token_ids, length, labels = batch
            preds = model(token_ids, length)
            num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_evaluator.compute(
                length, preds, labels)
            chunk_evaluator.update(num_infer_chunks.numpy(),
                                   num_label_chunks.numpy(),
                                   num_correct_chunks.numpy())
    # The counters are cumulative, so a single accumulate() after the loop
    # equals the per-batch calls and stays defined for an empty loader.
    precision, recall, f1_score = chunk_evaluator.accumulate()
    print("eval precision: %f, recall: %f, f1: %f" %
          (precision, recall, f1_score))
示例#4
0
def evaluate(args):
    """Evaluate a BiGRU-CRF checkpoint on the LAC test set via ``paddle.Model``.

    Reads ``use_gpu``, ``data_dir``, ``batch_size``, ``emb_dim``,
    ``hidden_size`` and ``init_checkpoint`` from ``args``. Results are
    reported by ``paddle.Model.evaluate`` itself.
    """
    if args.use_gpu:
        place = paddle.CUDAPlace(0)
        device_name = "gpu"
    else:
        place = paddle.CPUPlace()
        device_name = "cpu"
    paddle.set_device(device_name)

    # Dataset and the per-field collation: (word_ids, length, label_ids).
    dataset = LacDataset(args.data_dir, mode='test')
    collate = Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    )

    def batchify_fn(samples):
        return collate(samples)

    # Sequential, complete pass over the test set.
    sampler = paddle.io.BatchSampler(dataset=dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     drop_last=False)
    loader = paddle.io.DataLoader(dataset=dataset,
                                  batch_sampler=sampler,
                                  places=place,
                                  return_list=True,
                                  collate_fn=batchify_fn)

    # Network wrapped in the high-level API, with its two declared inputs.
    network = BiGruCrf(args.emb_dim, args.hidden_size, dataset.vocab_size,
                       dataset.num_labels)
    input_specs = [
        InputSpec(shape=(-1, ), dtype="int16", name='inputs'),
        InputSpec(shape=(-1, ), dtype="int16", name='lengths'),
    ]
    model = paddle.Model(network, inputs=input_specs)
    evaluator = ChunkEvaluator(label_list=dataset.label_vocab.keys(),
                               suffix=True)
    # No optimizer and no loss: the model is prepared for evaluation only.
    model.prepare(None, None, evaluator)

    # Restore the checkpoint, then score the test set.
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
示例#5
0
文件: eval.py 项目: jandyu/models-1
def evaluate(args):
    """Evaluate a BiGRU-CRF checkpoint on the LAC test set via ``paddle.Model``.

    Reads ``use_gpu``, ``data_dir``, ``batch_size``, ``emb_dim``,
    ``hidden_size`` and ``init_checkpoint`` from ``args``. Results are
    reported by ``paddle.Model.evaluate`` itself.
    """
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    # drop_last must be False here: dropping the trailing partial batch would
    # silently exclude test samples from the reported metrics.
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size,
                       test_dataset.num_labels)
    model = paddle.Model(network)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((test_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    # No optimizer and no loss: the model is prepared for evaluation only.
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
示例#6
0
def test_ner_dataset(client):
    """Score a served NER model on the ``msra_ner`` test split.

    Tokens are sent to ``client.predict`` in batches of 32, the predictions
    are fed into a ``ChunkEvaluator``, and the accumulated result is printed.

    Args:
        client: Object exposing ``predict(feed_dict=...)`` whose response has
            ``err_no``, ``err_msg`` and ``value`` (two JSON strings: one
            handed to ``print_ret`` for display, one holding the predicted
            label ids).

    Raises:
        ValueError: If a response carries a non-zero ``err_no``.
    """
    from paddlenlp.metrics import ChunkEvaluator
    from datasets import load_dataset
    import paddle

    dev_ds = load_dataset("msra_ner", split="test")

    import os
    # Proxy settings are cleared only after the dataset has been fetched.
    for proxy_var in ('https_proxy', 'http_proxy'):
        if os.environ.get(proxy_var):
            del os.environ[proxy_var]

    print("Start infer...")
    metric = ChunkEvaluator(
        label_list=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'])
    # Read each column once instead of re-reading it on every iteration.
    all_tokens = dev_ds["tokens"]
    all_tags = dev_ds["ner_tags"]
    idx = 0
    batch_size = 32
    # NOTE(review): the "- 1" means the last example is never evaluated —
    # confirm this is intentional.
    max_len = len(all_tokens) - 1
    while idx < max_len:
        end_idx = min(idx + batch_size, max_len)
        data = all_tokens[idx:end_idx]
        ret = client.predict(feed_dict={"tokens": data})
        if ret.err_no != 0:
            raise ValueError("err_no", ret.err_no, "err_msg: ", ret.err_msg)
        # Only the first two batches are pretty-printed.
        if idx < batch_size * 2:
            print_ret(json.loads(ret.value[0]), data)

        # calculate metric
        preds = json.loads(ret.value[1])
        label_list = all_tags[idx:end_idx]
        label_list = label_pad(label_list, preds)
        label_list = paddle.to_tensor(label_list)
        preds = paddle.to_tensor(preds)
        # Every sequence is scored over the full width of the predictions.
        seq_len = [preds.shape[1]] * preds.shape[0]

        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            paddle.to_tensor(seq_len), preds, label_list)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        idx += batch_size
        print(idx)

    # Despite the "acc" label, this prints whatever accumulate() returns.
    res = metric.accumulate()
    print("acc: ", res)
示例#7
0
def train(args):
    """Train a BiGRU-CRF tagger with periodic checkpointing and evaluation.

    Reads ``device``, ``data_dir``, ``max_seq_len``, ``batch_size``,
    ``emb_dim``, ``hidden_size``, ``crf_lr``, ``base_lr``,
    ``init_checkpoint``, ``epochs``, ``logging_steps``, ``save_steps``,
    ``model_save_dir`` and ``do_eval`` from ``args``. Checkpoints are written
    by rank 0 only; the best-F1 model is saved as ``best_model.pdparams``.
    """
    paddle.set_device(args.device)

    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    # Datasets and vocabularies all live under args.data_dir.
    data_dir = args.data_dir
    train_ds, test_ds = load_dataset(
        datafiles=(os.path.join(data_dir, 'train.tsv'),
                   os.path.join(data_dir, 'test.tsv')))

    word_vocab = load_vocab(os.path.join(data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC case to SBC case
    normlize_vocab = load_vocab(os.path.join(data_dir, 'q2b.dic'))

    trans_func = partial(convert_example,
                         max_seq_len=args.max_seq_len,
                         word_vocab=word_vocab,
                         label_vocab=label_vocab,
                         normlize_vocab=normlize_vocab)
    for dataset in (train_ds, test_ds):
        dataset.map(trans_func)

    # Per-field collation: (word_ids, length, label_ids).
    collate = Tuple(
        Pad(axis=0, pad_val=word_vocab.get("[PAD]", 0), dtype='int64'),
        Stack(dtype='int64'),
        Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64'),
    )

    def batchify_fn(samples):
        return collate(samples)

    # Shuffled, distributed batches for training.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    # Sequential, complete batches for evaluation.
    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Network, optimizer and metric evaluator.
    model = BiGruCrf(args.emb_dim,
                     args.hidden_size,
                     len(word_vocab),
                     len(label_vocab),
                     crf_lr=args.crf_lr)
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(),
                                     suffix=True)

    # Optional warm start; a missing file is reported, not fatal.
    init_checkpoint = args.init_checkpoint
    if init_checkpoint:
        if not os.path.exists(init_checkpoint):
            logger.info("Cannot init checkpoint from %s which doesn't exist" %
                        init_checkpoint)
        else:
            logger.info("Init checkpoint from %s" % init_checkpoint)
            model.load_dict(paddle.load(init_checkpoint))

    logger.info("Start training")
    global_step = 0
    last_step = args.epochs * len(train_loader)
    max_f1_score = -1
    # Timing/throughput counters, reset after every log line.
    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()
    for _ in range(args.epochs):
        for batch in train_loader:
            # Time spent waiting for this batch from the loader.
            train_reader_cost += time.time() - reader_start
            global_step += 1
            token_ids, length, label_ids = batch

            # Only the forward pass and loss reduction are timed.
            train_start = time.time()
            loss = model(token_ids, length, label_ids)
            avg_loss = paddle.mean(loss)
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size

            if global_step % args.logging_steps == 0:
                elapsed = train_reader_cost + train_run_cost
                logger.info(
                    "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                    % (global_step, last_step, avg_loss,
                       train_reader_cost / args.logging_steps,
                       elapsed / args.logging_steps,
                       total_samples / args.logging_steps,
                       total_samples / elapsed))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            # Checkpoint on the save interval and on the final step, rank 0 only.
            save_due = (global_step % args.save_steps == 0
                        or global_step == last_step)
            if save_due and rank == 0:
                step_path = os.path.join(args.model_save_dir,
                                         "model_%d.pdparams" % global_step)
                paddle.save(model.state_dict(), step_path)
                logger.info("Save %d steps model." % (global_step))
                if args.do_eval:
                    _, _, f1_score = evaluate(model, chunk_evaluator,
                                              test_loader)
                    if f1_score > max_f1_score:
                        max_f1_score = f1_score
                        best_path = os.path.join(args.model_save_dir,
                                                 "best_model.pdparams")
                        paddle.save(model.state_dict(), best_path)
                        logger.info("Save best model.")

            # Restart the reader clock for the next batch.
            reader_start = time.time()
示例#8
0
def train(args):
    """Train a BiGRU-CRF tagger on the LAC dataset with ``paddle.Model.fit``.

    Reads ``use_gpu``, ``data_dir``, ``batch_size``, ``emb_dim``,
    ``hidden_size``, ``base_lr``, ``init_checkpoint``, ``epochs`` and
    ``model_save_dir`` from ``args``. The model is evaluated on the test set
    and saved after every epoch.
    """
    if args.use_gpu:
        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
        paddle.set_device("gpu")
    else:
        place = paddle.CPUPlace()
        paddle.set_device("cpu")

    # create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Evaluation must see every test sample: dropping the trailing partial
    # batch would skew the metrics reported after each epoch.
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size, train_dataset.vocab_size,
                       train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    # The loss is built on the transition parameters of the network's CRF.
    crf_loss = LinearChainCrfLoss(network.crf.transitions)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((train_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for START and STOP
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              drop_last=True,
              shuffle=True,
              callbacks=callback)
示例#9
0
def run(args):
    """Fine-tune and/or evaluate a token-classification model on a NER dataset.

    Reads ``device``, ``task_name``, ``model_name_or_path``,
    ``max_seq_length``, ``batch_size``, ``do_train``, ``do_eval``,
    ``max_steps``, ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
    ``adam_epsilon``, ``weight_decay``, ``logging_steps``, ``save_steps`` and
    ``output_dir`` from ``args``.

    With ``do_train`` the model is trained and the best-F1 weights are saved
    to ``output_dir``; the function returns as soon as the planned number of
    steps is reached, so the ``do_eval`` branch is skipped in that case.
    With ``do_eval`` a model is loaded from ``model_name_or_path`` and scored
    on the test split.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    raw_datasets = load_dataset(args.task_name)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    train_ds = raw_datasets['train']
    column_names = train_ds.column_names

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)

    batchify_fn = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Define the model network and its loss
    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def tokenize_and_align_labels(examples, no_entity_id=0):
        """Tokenize a batch and make each label list as long as its input ids."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            # Two positions are reserved for the no-entity labels added at
            # both ends below; truncate the word labels to what is left.
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            # Fill any remaining positions with the no-entity label.
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    test_ds = raw_datasets['test']
    # NOTE(review): the last test example is excluded — confirm intent.
    test_ds = test_ds.select(range(len(test_ds) - 1))
    test_ds = test_ds.map(tokenize_and_align_labels,
                          batched=True,
                          remove_columns=column_names)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    if args.do_train:
        # NOTE(review): the last training example is excluded — confirm intent.
        train_ds = train_ds.select(range(len(train_ds) - 1))

        train_ds = train_ds.map(tokenize_and_align_labels,
                                batched=True,
                                remove_columns=column_names)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

        train_data_loader = DataLoader(dataset=train_ds,
                                       collate_fn=batchify_fn,
                                       num_workers=0,
                                       batch_sampler=train_batch_sampler,
                                       return_list=True)

        # A positive max_steps overrides the epoch-derived step count.
        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_steps)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        # A set keeps the per-parameter membership test constant-time.
        decay_params = {
            p.name
            for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        }
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)

        loss_fct = paddle.nn.loss.CrossEntropyLoss()

        metric = ChunkEvaluator(label_list=label_list)

        global_step = 0
        best_f1 = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                logits = model(batch['input_ids'], batch['token_type_ids'])
                loss = loss_fct(logits, batch['labels'])
                avg_loss = paddle.mean(loss)
                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, avg_loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                avg_loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    # Only rank 0 evaluates and writes the best model.
                    if paddle.distributed.get_rank() == 0:
                        f1 = evaluate(model, loss_fct, metric,
                                      test_data_loader, label_num, "test")
                        if f1 > best_f1:
                            best_f1 = f1
                            output_dir = args.output_dir
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # Need better way to get inner model of DataParallel
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                if global_step >= num_training_steps:
                    print("best_f1: ", best_f1)
                    return
        print("best_f1: ", best_f1)

    if args.do_eval:
        eval_data_loader = DataLoader(dataset=test_ds,
                                      collate_fn=batchify_fn,
                                      num_workers=0,
                                      batch_size=args.batch_size,
                                      return_list=True)

        # Define the model network and its loss
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path, num_classes=label_num)
        loss_fct = paddle.nn.loss.CrossEntropyLoss()

        metric = ChunkEvaluator(label_list=label_list)

        model.eval()
        metric.reset()
        # Default so the report below still works when the loader is empty.
        avg_loss = 0.0
        # Pure inference: no gradient bookkeeping is needed.
        with paddle.no_grad():
            for batch in eval_data_loader:
                logits = model(batch["input_ids"], batch["token_type_ids"])
                loss = loss_fct(logits, batch["labels"])
                # NOTE: only the last batch's loss survives the loop, as before.
                avg_loss = paddle.mean(loss)
                preds = logits.argmax(axis=2)
                num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                    batch["length"], preds, batch["labels"])
                metric.update(num_infer_chunks.numpy(),
                              num_label_chunks.numpy(),
                              num_correct_chunks.numpy())
        # The counters are cumulative, so one accumulate() after the loop
        # yields the same values as calling it after every batch.
        precision, recall, f1_score = metric.accumulate()
        print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
              (avg_loss, precision, recall, f1_score))
示例#10
0
    # Load the evaluation data from args.test_path and the label dictionaries
    # from args.label_path.
    model_name = "skep_ernie_1.0_large_ch"
    label2id, id2label = load_dict(args.label_path)
    test_ds = load_dataset(read, data_path=args.test_path, lazy=False)

    # Convert every raw example into model features with the SKEP tokenizer.
    tokenizer = SkepTokenizer.from_pretrained(model_name)
    trans_func = partial(convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=args.max_seq_len)
    test_ds = test_ds.map(trans_func, lazy=False)

    # Collate four fields per sample: two padded with the tokenizer's pad
    # token id / pad token-type id, one stacked as-is, and one padded with -1.
    # NOTE(review): presumably (input_ids, token_type_ids, seq_len, labels) —
    # confirm against convert_example_to_feature.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
        Pad(axis=0, pad_val=-1, dtype="int64")
    ): fn(samples)

    # Iterate the data sequentially (no shuffling).
    test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn)

    # Build the pretrained network, then overwrite its parameters with the
    # state dict stored at args.model_path.
    loaded_state_dict = paddle.load(args.model_path)
    model = SkepForTokenClassification.from_pretrained(model_name, num_classes=len(label2id))
    model.load_dict(loaded_state_dict)

    # The chunk evaluator is given the label names (the keys of label2id).
    metric = ChunkEvaluator(label2id.keys())

    # Evaluate on the loaded data and report precision, recall and F1.
    precision, recall, f1  = evaluate(model, test_loader,  metric)
    print(f'evalution result: precision: {precision:.5f}, recall: {recall:.5f},  F1: {f1:.5f}')
示例#11
0
def _dynabert_training(self, task_name, ofa_model, model, teacher_model,
                       train_dataloader, eval_dataloader, width_mult_list,
                       criterion, num_train_epochs, output_dir):
    """
    Train ``ofa_model`` with DynaBERT-style width-adaptive distillation.

    For every training batch the supernet is run once per entry of
    ``width_mult_list``. Each pass contributes the OFA distillation loss plus
    a soft cross-entropy between student and (detached) teacher logits;
    gradients are accumulated over all widths and a single optimizer /
    scheduler step is taken per batch.

    Evaluation schedule:
      * tasks whose name contains ``"cmrc2018"`` are evaluated at the end of
        every epoch;
      * every other task is evaluated every ``self.args.save_steps`` steps.
    Whenever a width beats the best score seen so far, ``model`` is saved
    under ``output_dir/<width_mult>`` (rank 0 only).

    Args:
        task_name: Task identifier. ``"cmrc2018"`` (substring match) selects
            the start/end-logits path, ``"msra_ner"`` selects chunk-level
            evaluation, anything else is scored with accuracy.
        ofa_model: Supernet to train; called as
            ``ofa_model(input_ids, token_type_ids, attention_mask=[None, None])``
            and expected to return ``(logits, teacher_logits)``.
        model: Model that is written to disk with ``save_pretrained``
            (unwrapped first if it is a ``paddle.DataParallel``).
        teacher_model: Teacher network, evaluated for reference only.
        train_dataloader: Iterable of dict-like training batches.
        eval_dataloader: Iterable of dict-like evaluation batches.
        width_mult_list: Width multipliers to train and evaluate.
        criterion: Loss used for the evaluation loss of non-cmrc2018 tasks.
        num_train_epochs: Number of epochs to iterate over ``train_dataloader``.
        output_dir: Root directory for saved sub-models.

    Returns:
        ``ofa_model`` after training. Training stops early once
        ``global_step`` exceeds ``self.args.num_training_steps``.
    """
    metric = Accuracy()
    if task_name == "msra_ner":
        metric = ChunkEvaluator(label_list=self.train_dataset.label_list)

    @paddle.no_grad()
    def evaluate(model, criterion, data_loader, width_mult=1.0):
        """
        Evaluate ``model`` on ``data_loader`` and log the result.

        ``width_mult == 100`` is the sentinel the callers use for the teacher
        model; it only changes log messages and how cmrc2018 logits are
        unpacked. Returns exact-match (cmrc2018), F1 (msra_ner) or the value
        of ``metric.accumulate()`` for every other task.
        """
        model.eval()
        all_start_logits = []
        all_end_logits = []
        metric.reset()
        for batch in data_loader:
            if "cmrc2018" in task_name:
                input_ids, token_type_ids = batch['input_ids'], batch[
                    'token_type_ids']
                logits = model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                if width_mult == 100:
                    start_logits_tensor, end_logits_tensor = logits
                else:
                    start_logits_tensor, end_logits_tensor = logits[0]
                # Convert each tensor to numpy once per batch instead of once
                # per example.
                start_logits = start_logits_tensor.numpy()
                end_logits = end_logits_tensor.numpy()
                for idx in range(start_logits_tensor.shape[0]):
                    if len(all_start_logits) % 1000 == 0 and len(
                            all_start_logits):
                        logger.info("Processing example: %d" %
                                    len(all_start_logits))
                    all_start_logits.append(start_logits[idx])
                    all_end_logits.append(end_logits[idx])

            else:
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
                logits = model(
                    input_ids, segment_ids, attention_mask=[None, None])
                if isinstance(logits, tuple):
                    logits = logits[0]
                loss = criterion(logits, labels)
                if task_name == "msra_ner":
                    preds = logits.argmax(axis=2)
                    num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                        batch['seq_len'], preds, batch['labels'])
                    metric.update(num_infer_chunks.numpy(),
                                  num_label_chunks.numpy(),
                                  num_correct_chunks.numpy())
                else:
                    correct = metric.compute(logits, labels)
                    metric.update(correct)
        if "cmrc2018" in task_name:
            n_best_size = 20
            max_answer_length = 50
            all_predictions, _, _ = compute_prediction(
                self.eval_examples, self.eval_dataset,
                (all_start_logits, all_end_logits), False, n_best_size,
                max_answer_length)
            res = squad_evaluate(
                examples=list(self.eval_examples),
                preds=all_predictions,
                is_whitespace_splited=False)
            if width_mult == 100:
                logger.info("teacher model, EM: %f, F1: %f" %
                            (res['exact'], res['f1']))
            else:
                logger.info("width_mult: %s, EM: %f, F1: %f, " %
                            (str(width_mult), res['exact'], res['f1']))
            res = res['exact']
        else:
            res = metric.accumulate()
            # The logged loss is the loss of the last evaluated batch only.
            if task_name == "msra_ner":
                if width_mult == 100:
                    logger.info(
                        "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (paddle.mean(loss).numpy(), res[0], res[1], res[2]))
                else:
                    logger.info(
                        "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (str(width_mult), paddle.mean(loss).numpy(), res[0],
                           res[1], res[2]))
                res = res[2]
            else:
                if width_mult == 100:
                    logger.info("teacher model, eval loss: %f, acc: %s, " %
                                (loss.numpy(), res))
                else:
                    logger.info("width_mult: %s, eval loss: %f, acc: %s, " %
                                (str(width_mult), loss.numpy(), res))
        model.train()
        return res

    from paddleslim.nas.ofa import OFA, DistillConfig, utils

    def save_model(width_mult):
        """Save ``model`` under ``output_dir/<width_mult>``."""
        output_dir_width = os.path.join(output_dir, str(width_mult))
        os.makedirs(output_dir_width, exist_ok=True)
        # need better way to get inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir_width)

    def evaluate_and_checkpoint(best_acc):
        """
        Evaluate the teacher and every width; save on improvement.

        Returns the (possibly updated) best score.
        """
        tic_eval = time.time()
        evaluate(teacher_model, criterion, eval_dataloader, width_mult=100)
        logger.info("eval done total : %s s" % (time.time() - tic_eval))
        for width_mult in width_mult_list:
            net_config = utils.dynabert_config(ofa_model, width_mult)
            ofa_model.set_net_config(net_config)
            tic_eval = time.time()
            acc = evaluate(ofa_model, criterion, eval_dataloader, width_mult)
            if acc > best_acc:
                best_acc = acc
                # Only one process writes the checkpoint.
                if paddle.distributed.get_rank() == 0:
                    save_model(width_mult)
            logger.info("eval done total : %s s" % (time.time() - tic_eval))
        return best_acc

    global_step = 0
    lambda_logit = 1.0
    tic_train = time.time()
    best_acc = 0.0
    logger.info("DynaBERT training starts. This period will cost some time.")
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_dataloader):
            global_step += 1
            if "cmrc2018" in task_name:
                input_ids, token_type_ids, start_positions, end_positions = batch[
                    'input_ids'], batch['token_type_ids'], batch[
                        'start_positions'], batch['end_positions']
            else:
                input_ids, token_type_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
            for width_mult in width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if "cmrc2018" in task_name:
                    # Average the start- and end-logit distillation losses.
                    logit_loss = (soft_cross_entropy(logits[0], teacher_logits[0].detach()) \
                                + \
                                soft_cross_entropy(logits[1], teacher_logits[1].detach()))/2
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + lambda_logit * logit_loss
                # Gradients accumulate across widths; one step per batch below.
                loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.clear_grad()

            if global_step % self.args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           self.args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if "cmrc2018" not in task_name and global_step % self.args.save_steps == 0:
                best_acc = evaluate_and_checkpoint(best_acc)

            if global_step > self.args.num_training_steps:
                if best_acc == 0.0:
                    # Nothing has been saved yet: save once, under the last
                    # width of width_mult_list, so a model is always produced.
                    save_model(width_mult)
                logger.info("Best acc: %.4f" % (best_acc))
                return ofa_model

        if "cmrc2018" in task_name:
            best_acc = evaluate_and_checkpoint(best_acc)

    logger.info("Best acc: %.4f" % (best_acc))
    return ofa_model
示例#12
0
class ErnieV2(nn.Layer):
    """
    Wrapper around the pretrained ``ernie-2.0-en`` weights.

    Depending on ``task`` the wrapper holds a sequence-classification model
    (``'seq-cls'``), a token-classification model (``'token-cls'``) or the
    bare encoder (``task=None``).
    """
    def __init__(
        self,
        task: str = None,
        load_checkpoint: str = None,
        label_map: Dict = None,
        num_classes: int = 2,
        **kwargs,
    ):
        """
        Build the backbone/head for ``task`` and optionally load a checkpoint.

        Args:
            task: ``'seq-cls'``, ``'token-cls'`` or ``None``. The legacy name
                ``'sequence_classification'`` is accepted with a warning.
            load_checkpoint: Path of a parameter file; loaded only when it is
                an existing file.
            label_map: Optional mapping used to derive the class count and,
                for ``'token-cls'``, the evaluator's label list.
            num_classes: Class count used when ``label_map`` is empty/None.
            **kwargs: Forwarded to ``from_pretrained``.

        Raises:
            RuntimeError: If ``task`` is not one of the supported values.
        """
        super(ErnieV2, self).__init__()
        # A non-empty label map takes precedence over ``num_classes``.
        if not label_map:
            self.num_classes = num_classes
        else:
            self.label_map = label_map
            self.num_classes = len(label_map)

        pretrained_name = 'ernie-2.0-en'

        if task == 'sequence_classification':
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed in the future.",
            )
            task = 'seq-cls'

        if task is None:
            self.model = ErnieModel.from_pretrained(
                pretrained_model_name_or_path=pretrained_name, **kwargs)
        elif task == 'seq-cls':
            self.model = ErnieForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path=pretrained_name,
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task == 'token-cls':
            self.model = ErnieForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=pretrained_name,
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            # Label names ordered by their sorted keys in the label map.
            ordered_labels = [
                self.label_map[key] for key in sorted(self.label_map.keys())
            ]
            self.metric = ChunkEvaluator(label_list=ordered_labels)
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task

        checkpoint_given = load_checkpoint is not None
        if checkpoint_given and os.path.isfile(load_checkpoint):
            self.set_state_dict(paddle.load(load_checkpoint))
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                seq_lengths=None,
                labels=None):
        """
        Run the wrapped model and post-process its output for the task.

        Returns:
            * ``'seq-cls'``: class probabilities, or
              ``(probs, loss, {'acc': acc})`` when ``labels`` is given.
            * ``'token-cls'``: per-token probabilities, or
              ``(token_level_probs, loss, {'f1_score': f1_score})`` when
              ``labels`` is given (the evaluator state is updated as well).
            * otherwise: ``(sequence_output, pooled_output)`` as returned by
              the bare model.
        """
        result = self.model(input_ids, token_type_ids, position_ids,
                            attention_mask)

        if self.task == 'seq-cls':
            probs = F.softmax(result, axis=1)
            if labels is None:
                return probs
            loss = self.criterion(result, labels)
            acc = self.metric.update(self.metric.compute(probs, labels))
            return probs, loss, {'acc': acc}

        if self.task == 'token-cls':
            token_level_probs = F.softmax(result, axis=-1)
            preds = token_level_probs.argmax(axis=-1)
            if labels is None:
                return token_level_probs
            loss = self.criterion(result, labels.unsqueeze(-1))
            n_infer, n_label, n_correct = self.metric.compute(
                None, seq_lengths, preds, labels)
            self.metric.update(n_infer.numpy(), n_label.numpy(),
                               n_correct.numpy())
            # accumulate() yields (precision, recall, f1); only f1 is reported.
            _, _, f1_score = [float(s) for s in self.metric.accumulate()]
            return token_level_probs, loss, {'f1_score': f1_score}

        sequence_output, pooled_output = result
        return sequence_output, pooled_output

    @staticmethod
    def get_tokenizer(*args, **kwargs):
        """
        Return the ``ErnieTokenizer`` pretrained for ``ernie-2.0-en``.

        Extra positional and keyword arguments are forwarded to
        ``ErnieTokenizer.from_pretrained``.
        """
        return ErnieTokenizer.from_pretrained(
            *args, pretrained_model_name_or_path='ernie-2.0-en', **kwargs)
def do_train(args):
    """
    Fine-tune ``ErnieDocForTokenClassification`` on a sequence-labeling dataset.

    The function loads the train/eval/test splits named by
    ``DATASET_INFO[args.dataset]``, trains for ``args.epochs`` epochs with
    ``AdamWDL`` and a linear warmup/decay schedule, evaluates on the eval
    split every ``args.save_steps`` steps (saving a checkpoint and, when F1
    improves, a ``best_model`` copy on rank 0) and finally evaluates on the
    test split.

    Args:
        args: Parsed command-line namespace. Fields read here: ``dataset``,
            ``model_name_or_path``, ``device``, ``batch_size``,
            ``max_seq_length``, ``seed``, ``epochs``, ``warmup_proportion``,
            ``learning_rate``, ``weight_decay``, ``layerwise_decay``,
            ``memory_length``, ``logging_steps``, ``save_steps``,
            ``output_dir`` and ``max_steps``.

    Returns:
        None. When ``args.max_steps > 0`` and that many steps have run, the
        function returns immediately, without the final test evaluation.
    """
    set_seed(args)
    tokenizer_class, eval_name, test_name = DATASET_INFO[args.dataset]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    train_ds, eval_ds, test_ds = load_dataset(
        args.dataset, splits=["train", eval_name, test_name])
    num_classes = len(train_ds.label_list)
    # The last label id is used as the "no entity" id.
    no_entity_id = num_classes - 1

    paddle.set_device(args.device)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()
    if rank == 0 and os.path.exists(args.model_name_or_path):
        logger.info("init checkpoint from %s" % args.model_name_or_path)
    model = ErnieDocForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    model_config = model.ernie_doc.config
    if trainer_num > 1:
        model = paddle.DataParallel(model)

    train_ds_iter = SequenceLabelingIterator(
        train_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        random_seed=args.seed,
        no_entity_id=no_entity_id)
    eval_ds_iter = SequenceLabelingIterator(
        eval_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        mode="eval",
        no_entity_id=no_entity_id)
    test_ds_iter = SequenceLabelingIterator(
        test_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        mode="test",
        no_entity_id=no_entity_id)

    train_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                           return_list=True)
    train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())
    eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device())
    test_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device())

    num_training_examples = train_ds_iter.get_num_examples()
    num_training_steps = args.epochs * num_training_examples // args.batch_size // trainer_num
    logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank))
    logger.info("Num train examples: %d" % num_training_examples)
    logger.info("Max train steps: %d" % num_training_steps)
    logger.info("Num warmup steps: %d" %
                int(num_training_steps * args.warmup_proportion))

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # Stored as a set: the optimizer probes it once per parameter.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    # Map each parameter's internal name to its structured name.
    name_dict = {p.name: n for n, p in model.named_parameters()}

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        n_layers=model_config["num_hidden_layers"],
                        layerwise_decay=args.layerwise_decay,
                        name_dict=name_dict)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = ChunkEvaluator(label_list=train_ds.label_list)

    global_steps = 0

    # Factory for a fresh set of memories; each evaluation run gets its own.
    create_memory = partial(init_memory, args.batch_size, args.memory_length,
                            model_config["hidden_size"],
                            model_config["num_hidden_layers"])
    # Memories threaded through the training steps (updated by every forward).
    memories = create_memory()
    tic_train = time.time()
    best_f1 = 0
    for epoch in range(args.epochs):
        train_ds_iter.shuffle_sample()
        train_dataloader.set_batch_generator(train_ds_iter,
                                             paddle.get_device())
        for step, batch in enumerate(train_dataloader, start=1):
            global_steps += 1
            input_ids, position_ids, token_type_ids, attn_mask, labels, lengths, qids, \
                gather_idx, need_cal_loss = batch
            logits, memories = model(input_ids, memories, token_type_ids,
                                     position_ids, attn_mask)
            # Keep only the rows selected by gather_idx for the loss.
            logits, labels = list(
                map(lambda x: paddle.gather(x, gather_idx), [logits, labels]))

            # need_cal_loss scales the batch loss (comes from the iterator).
            loss = criterion(logits, labels) * need_cal_loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_steps % args.logging_steps == 0:
                logger.info(
                    "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s"
                    % (global_steps, epoch, loss, lr_scheduler.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                # Evaluate
                logger.info("Eval:")
                _, _, f1_score = evaluate(model, metric, eval_dataloader,
                                          create_memory())
                # Save
                if rank == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % (global_steps))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if f1_score > best_f1:
                        logger.info("Save best model......")
                        best_f1 = f1_score
                        best_model_dir = os.path.join(args.output_dir,
                                                      "best_model")
                        os.makedirs(best_model_dir, exist_ok=True)
                        model_to_save.save_pretrained(best_model_dir)
                        tokenizer.save_pretrained(best_model_dir)

            if args.max_steps > 0 and global_steps >= args.max_steps:
                return

    logger.info("Final test result:")
    evaluate(model, metric, test_dataloader, create_memory())
示例#14
0
def do_train():
    """
    Fine-tune ``FasterErnieForTokenClassification`` on the ``msra_ner`` dataset.

    Configuration is read from the module-level ``args`` object (``device``,
    ``seed``, ``max_seq_length``, ``batch_size``, ``init_from_ckpt``,
    ``epochs``, ``learning_rate``, ``warmup_proportion``, ``weight_decay``,
    ``use_amp``, ``scale_loss`` and ``save_dir``).

    Every 500 steps, and once more on the final step, the model is evaluated
    on the test split and saved to ``<args.save_dir>/model_<global_step>``.
    """
    paddle.set_device(args.device)
    set_seed(args.seed)

    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    model = FasterErnieForTokenClassification.from_pretrained(
        "ernie-1.0",
        num_classes=len(train_ds.label_list),
        max_seq_len=args.max_seq_length,
        is_split_into_words=True)

    # ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
    label_num = len(train_ds.label_list)
    # the label 'O'  index
    no_entity_id = label_num - 1
    # ignore_label is for the label padding
    ignore_label = -100
    trans_func = partial(batchify_fn,
                         no_entity_id=no_entity_id,
                         ignore_label=ignore_label,
                         max_seq_len=args.max_seq_length)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   collate_fn=trans_func,
                                   return_list=True)
    test_data_loader = DataLoader(dataset=test_ds,
                                  batch_size=args.batch_size,
                                  collate_fn=trans_func,
                                  return_list=True)

    # Optionally start from a previously saved parameter file.
    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    num_training_steps = len(train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # Stored as a set: the optimizer probes it once per parameter.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Padded label positions (ignore_label) do not contribute to the loss.
    criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=train_ds.label_list)
    if args.use_amp:
        # The scaler only exists in AMP mode; it is only used in that mode.
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.epochs):
        for step, (texts, labels, seq_lens) in enumerate(train_data_loader,
                                                         start=1):
            texts = to_tensor(texts)
            global_step += 1
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["fused_feedforward",
                                       "fused_attention"]):
                logits, preds = model(texts)
                loss = criterion(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % 10 == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, 10 /
                       (time.time() - tic_train)))
                tic_train = time.time()
            if args.use_amp:
                scaler.scale(avg_loss).backward()
                scaler.minimize(optimizer, avg_loss)
            else:
                avg_loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            # Periodic checkpoint, plus one on the very last training step.
            if global_step % 500 == 0 or global_step == num_training_steps:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                os.makedirs(save_dir, exist_ok=True)
                evaluate(model, criterion, metric, test_data_loader, label_num)
                model.save_pretrained(save_dir)
示例#15
0
class Electra(nn.Layer):
    """
    Wrapper around the pretrained ``electra-small`` weights.

    Depending on ``task`` the wrapper holds a sequence-classification model
    (``'seq-cls'``), a token-classification model (``'token-cls'``), an
    encoder plus a two-class matching head (``'text-matching'``) or the bare
    encoder (``task=None``).
    """
    def __init__(
        self,
        task: str = None,
        load_checkpoint: str = None,
        label_map: Dict = None,
        num_classes: int = 2,
        **kwargs,
    ):
        """
        Build the backbone/head for ``task`` and optionally load a checkpoint.

        Args:
            task: ``'seq-cls'``, ``'token-cls'``, ``'text-matching'`` or
                ``None``. The legacy name ``'sequence_classification'`` is
                accepted with a warning.
            load_checkpoint: Path of a parameter file; loaded only when it is
                an existing file.
            label_map: Optional mapping used to derive the class count and,
                for ``'token-cls'``, the evaluator's label list.
            num_classes: Class count used when ``label_map`` is empty/None.
            **kwargs: Forwarded to ``from_pretrained``.

        Raises:
            RuntimeError: If ``task`` is not one of the supported values.
        """
        super(Electra, self).__init__()
        # A non-empty label map takes precedence over ``num_classes``.
        if not label_map:
            self.num_classes = num_classes
        else:
            self.label_map = label_map
            self.num_classes = len(label_map)

        pretrained_name = 'electra-small'

        if task == 'sequence_classification':
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed in the future.",
            )
            task = 'seq-cls'

        if task is None:
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path=pretrained_name, **kwargs)
        elif task == 'seq-cls':
            self.model = ElectraForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path=pretrained_name,
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task == 'token-cls':
            self.model = ElectraForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=pretrained_name,
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            # Label names ordered by their sorted keys in the label map.
            ordered_labels = [
                self.label_map[key] for key in sorted(self.label_map.keys())
            ]
            self.metric = ChunkEvaluator(label_list=ordered_labels)
        elif task == 'text-matching':
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path=pretrained_name, **kwargs)
            self.dropout = paddle.nn.Dropout(0.1)
            # The head consumes [query_mean, title_mean, |query - title|],
            # hence three times the hidden size.
            matching_width = self.model.config['hidden_size'] * 3
            self.classifier = paddle.nn.Linear(matching_width, 2)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task

        checkpoint_given = load_checkpoint is not None
        if checkpoint_given and os.path.isfile(load_checkpoint):
            self.set_state_dict(paddle.load(load_checkpoint))
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))

    def _masked_mean(self, token_embedding, input_ids):
        """
        Apply dropout, then average over the non-padding positions.

        Positions whose id equals ``self.model.pad_token_id`` are zeroed out
        and excluded from the divisor.
        """
        dropped = self.dropout(token_embedding)
        keep_mask = paddle.unsqueeze(
            (input_ids != self.model.pad_token_id).astype(dropped.dtype),
            axis=2)
        summed = paddle.sum(dropped * keep_mask, axis=1)
        kept_count = paddle.sum(keep_mask, axis=1)
        return summed / kept_count

    def forward(self,
                input_ids=None,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                query_input_ids=None,
                query_token_type_ids=None,
                query_position_ids=None,
                query_attention_mask=None,
                title_input_ids=None,
                title_token_type_ids=None,
                title_position_ids=None,
                title_attention_mask=None,
                seq_lengths=None,
                labels=None):
        """
        Run the wrapped model and post-process its output for the task.

        ``'text-matching'`` uses the ``query_*`` / ``title_*`` inputs; every
        other task uses ``input_ids`` and its companions.

        Returns:
            * ``'seq-cls'`` / ``'text-matching'``: class probabilities, or
              ``(probs, loss, {'acc': acc})`` when ``labels`` is given.
            * ``'token-cls'``: per-token probabilities, or
              ``(token_level_probs, loss, {'f1_score': f1_score})`` when
              ``labels`` is given (the evaluator state is updated as well).
            * otherwise: ``(sequence_output, pooled_output)`` unpacked from
              the bare model's result.
        """
        if self.task == 'text-matching':
            # Encode both texts with the shared encoder, query first.
            query_result = self.model(query_input_ids, query_token_type_ids,
                                      query_position_ids, query_attention_mask)
            title_result = self.model(title_input_ids, title_token_type_ids,
                                      title_position_ids, title_attention_mask)

            query_mean = self._masked_mean(query_result, query_input_ids)
            title_mean = self._masked_mean(title_result, title_input_ids)

            sub = paddle.abs(paddle.subtract(query_mean, title_mean))
            projection = paddle.concat([query_mean, title_mean, sub], axis=-1)
            logits = self.classifier(projection)
            probs = F.softmax(logits)
            if labels is None:
                return probs
            loss = self.criterion(logits, labels)
            acc = self.metric.update(self.metric.compute(probs, labels))
            return probs, loss, {'acc': acc}

        result = self.model(input_ids, token_type_ids, position_ids,
                            attention_mask)

        if self.task == 'seq-cls':
            probs = F.softmax(result, axis=1)
            if labels is None:
                return probs
            loss = self.criterion(result, labels)
            acc = self.metric.update(self.metric.compute(probs, labels))
            return probs, loss, {'acc': acc}

        if self.task == 'token-cls':
            token_level_probs = F.softmax(result, axis=-1)
            preds = token_level_probs.argmax(axis=-1)
            if labels is None:
                return token_level_probs
            loss = self.criterion(result, labels.unsqueeze(-1))
            n_infer, n_label, n_correct = self.metric.compute(
                None, seq_lengths, preds, labels)
            self.metric.update(n_infer.numpy(), n_label.numpy(),
                               n_correct.numpy())
            # accumulate() yields (precision, recall, f1); only f1 is reported.
            _, _, f1_score = [float(s) for s in self.metric.accumulate()]
            return token_level_probs, loss, {'f1_score': f1_score}

        sequence_output, pooled_output = result
        return sequence_output, pooled_output

    @staticmethod
    def get_tokenizer(*args, **kwargs):
        """
        Return the ``ElectraTokenizer`` pretrained for ``electra-small``.

        Extra positional and keyword arguments are forwarded to
        ``ElectraTokenizer.from_pretrained``.
        """
        return ElectraTokenizer.from_pretrained(
            *args, pretrained_model_name_or_path='electra-small', **kwargs)
示例#16
0
def do_train(args):
    """Fine-tune ``BertForTokenClassification`` on the MSRA NER dataset.

    Builds the train/dev data loaders, a warmup + linear-decay learning-rate
    schedule and an AdamW optimizer, then runs the training loop, logging
    every ``args.logging_steps`` steps and evaluating/saving every
    ``args.save_steps`` steps.

    Args:
        args: Parsed options. Reads ``n_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``learning_rate``,
            ``warmup_steps``, ``max_steps``, ``num_train_epochs``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "dev"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    # The last label id is handed to convert_example as the "no entity" id.
    no_entity_id = label_num - 1
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=no_entity_id,
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)

    # Padded label positions get ignore_label so the loss skips them.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label)  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               drop_last=True)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_warmup_steps = args.warmup_steps
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)

    def lr_lambda(current_step):
        """Scale factor: linear warmup, then linear decay down to zero."""
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   lr_lambda)

    # Names of the parameters that receive weight decay: everything except
    # those whose structured name contains "bias" or "norm". Computed once
    # here instead of being rebuilt on every apply_decay_param_fun call.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            input_ids, segment_ids, length, labels = batch
            logits = model(input_ids, segment_ids)
            # Flatten to (tokens, classes) / (tokens,) for the loss.
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            # NOTE: global_step is incremented at the end of the iteration,
            # so the very first step (global_step == 0) also logs and, below,
            # evaluates and saves.
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0:
                evaluate(model, loss_fct, metric, dev_data_loader, label_num)
                # Only one process writes the checkpoint in multi-GPU runs.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            global_step += 1
示例#17
0
        collate_fn=batchify_fn)

    test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_size=200,
        drop_last=True,
        return_list=True,
        collate_fn=batchify_fn)

    network = BiGRUWithCRF(300, 300, train_ds.word_num, train_ds.label_num)
    model = paddle.Model(network)

    optimizer = paddle.optimizer.Adam(
        learning_rate=0.001, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_ds.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)

    model.fit(train_data=train_loader,
              eval_data=dev_loader,
              epochs=10,
              save_dir='./results',
              log_freq=1)

    model.evaluate(eval_data=test_loader)
    outputs, lens, decodes = model.predict(test_data=test_loader)
    preds = parse_decodes(test_ds, decodes, lens)

    print('\n'.join(preds[:10]))
示例#18
0
def do_eval(args):
    """Evaluate a BERT token-classification model on the MSRA NER test split.

    Loads the dataset and tokenizer, optionally restores weights from
    ``args.init_checkpoint_path``, runs the model over the evaluation data
    without gradient tracking and prints the loss of the last batch together
    with the chunk-level precision, recall and F1 accumulated over all
    batches.

    Args:
        args: Parsed options. Reads ``device``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size`` and ``init_checkpoint_path``.
    """
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        """Tokenize a batch of examples and align labels to the token ids."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            # Truncate labels so they fit between the two added positions.
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            # One no-entity label on each side, then pad up to the length of
            # input_ids.
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # Padded label positions get ignore_label so the loss skips them.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    # NOTE(review): this drops the last example of the split; the reason is
    # not visible here - confirm it is intentional.
    eval_ds = eval_ds.select(range(len(eval_ds) - 1))
    eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    avg_loss = None
    # Inference only: no gradients are needed.
    with paddle.no_grad():
        for batch in eval_data_loader:
            input_ids, token_type_ids, length, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                length, preds, labels)
            metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                          num_correct_chunks.numpy())

    if avg_loss is None:
        # The loader yielded nothing, so there is no loss or metric to report.
        print("eval data loader yielded no batches; nothing to evaluate")
        return

    # The metric accumulates over every update, so it is read once at the
    # end; the reported loss is that of the last batch only.
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
示例#19
0
def train(args):
    """Train the ``BiGruCrf`` tagger on the TSV data under ``args.data_dir``.

    Loads the train/test sets and the word, tag and q2b vocabularies, builds
    the data loaders, optionally restores ``args.init_checkpoint``, then
    trains for ``args.epochs`` epochs. Every ``args.save_steps`` steps, and
    on the final step, the rank-0 process evaluates on the test set and
    writes a checkpoint into ``args.model_save_dir``.

    Args:
        args: Parsed options. Reads ``device``, ``data_dir``, ``max_seq_len``,
            ``batch_size``, ``emb_dim``, ``hidden_size``, ``base_lr``,
            ``init_checkpoint``, ``epochs``, ``logging_steps``,
            ``save_steps`` and ``model_save_dir``.
    """
    paddle.set_device(args.device)

    def in_data_dir(filename):
        """Return the path of ``filename`` inside the data directory."""
        return os.path.join(args.data_dir, filename)

    # Datasets and vocabularies all live in the same directory.
    train_ds, test_ds = load_dataset(
        datafiles=(in_data_dir('train.tsv'), in_data_dir('test.tsv')))

    word_vocab = load_vocab(in_data_dir('word.dic'))
    label_vocab = load_vocab(in_data_dir('tag.dic'))
    # q2b.dic supplies the character normalization table handed to
    # convert_example.
    normalize_vocab = load_vocab(in_data_dir('q2b.dic'))

    convert = partial(convert_example,
                      max_seq_len=args.max_seq_len,
                      word_vocab=word_vocab,
                      label_vocab=label_vocab,
                      normlize_vocab=normalize_vocab)
    # The return value of map is not used; the datasets are passed on as-is.
    train_ds.map(convert)
    test_ds.map(convert)

    collate = Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    )

    def batchify_fn(samples):
        """Collate a list of samples into padded/stacked arrays."""
        return collate(samples)

    # Training batches: shuffled, incomplete last batch dropped.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    # Test batches: original order, every sample kept.
    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Network, optimizer and chunk-level metric.
    vocab_size = len(word_vocab)
    num_labels = len(label_vocab)
    model = BiGruCrf(args.emb_dim, args.hidden_size, vocab_size, num_labels)
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(),
                                     suffix=True)

    if args.init_checkpoint:
        model.load_dict(paddle.load(args.init_checkpoint))

    # Training loop.
    total_steps = args.epochs * len(train_loader)
    completed = 0
    window_start = time.time()
    for _ in range(args.epochs):
        for token_ids, length, label_ids in train_loader:
            completed += 1
            mean_loss = paddle.mean(model(token_ids, length, label_ids))
            if completed % args.logging_steps == 0:
                print("global step %d / %d, loss: %f, speed: %.2f step/s" %
                      (completed, total_steps, mean_loss, args.logging_steps /
                       (time.time() - window_start)))
                window_start = time.time()
            mean_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            at_checkpoint = (completed % args.save_steps == 0
                             or completed == total_steps)
            # Only the rank-0 process evaluates and writes checkpoints.
            if at_checkpoint and paddle.distributed.get_rank() == 0:
                evaluate(model, chunk_evaluator, test_loader)
                checkpoint_path = os.path.join(
                    args.model_save_dir, "model_%d.pdparams" % completed)
                paddle.save(model.state_dict(), checkpoint_path)
示例#20
0
文件: train.py 项目: dot23/PaddleNLP
def do_train(args):
    """Fine-tune ``BertForTokenClassification`` on the ``msra_ner`` dataset.

    Builds the train/test loaders, a ``LinearDecayWithWarmup`` schedule and an
    AdamW optimizer, then trains. Every ``args.save_steps`` steps, and on the
    final step, the model is evaluated on the test set and a checkpoint is
    written to ``args.output_dir``. Training stops once the number of steps
    the learning-rate schedule was built for has been reached (i.e. at
    ``args.max_steps`` when it is positive), or when the epochs run out,
    whichever comes first.

    Args:
        args: Parsed options. Reads ``n_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    # The last label id is handed to the alignment function as "no entity".
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    # Padded label positions get ignore_label so the loss skips them.
    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict(
        {
            'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id
                                  ),  # segment
            'seq_len': Stack(),  # seq_len
            'labels': Pad(axis=0, pad_val=ignore_label)  # label
        }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    steps_in_all_epochs = len(train_data_loader) * args.num_train_epochs
    num_training_steps = (args.max_steps
                          if args.max_steps > 0 else steps_in_all_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Names of the parameters that receive weight decay: everything except
    # those whose structured name contains "bias" or "norm". Computed once
    # here instead of being rebuilt on every apply_decay_param_fun call.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    # The final step is whichever comes first: the step count the schedule
    # was built for, or the end of the last epoch. Previously max_steps only
    # shaped the schedule and the loop kept running past it.
    last_step = min(num_training_steps, steps_in_all_epochs)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            # Flatten to (tokens, classes) / (tokens,) for the loss.
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == last_step:
                # Only one process evaluates and saves in multi-GPU runs.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            if global_step >= last_step:
                return
示例#21
0
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    metric = ChunkEvaluator(label_list=train_ds.label_list, suffix=True)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, seq_lens, labels = batch
            loss = model(
                input_ids, token_type_ids, seq_lens=seq_lens, labels=labels)
            avg_loss = paddle.mean(loss)
            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       10 / (time.time() - tic_train)))
示例#22
0
    def predict(self,
                dataset,
                tokenizer,
                batchify_fn,
                args,
                dev_example=None,
                dev_ds_ori=None):
        """Run batched inference over ``dataset`` and report speed or metrics.

        The dataset is cut into batches of at most ``args.batch_size`` samples.
        With ``args.perf`` set, a few warm-up batches are run and then the
        whole dataset is timed and the throughput printed. Otherwise the
        predictions are scored according to ``args.task_name``:
        ``"msra_ner"`` with a ``ChunkEvaluator``, ``"cmrc2018"`` with
        ``squad_evaluate``, and any other task with the metric registered in
        ``METRIC_CLASSES``.

        Args:
            dataset: Indexable dataset of features; ``dataset[i]`` is one
                sample. For ``"cmrc2018"`` it must support ``remove_columns``.
            tokenizer: Not used by this method.
            batchify_fn: Collates a list of samples into a mapping whose
                ``"input_ids"`` / ``"token_type_ids"`` entries expose
                ``.numpy()``.
            args: Options. Reads ``collect_shape``, ``max_seq_length``,
                ``batch_size``, ``task_name``, ``perf``,
                ``perf_warmup_steps``, ``label_list``, ``n_best_size`` and
                ``max_answer_length`` depending on the branch taken.
            dev_example: Raw examples, used only for ``"cmrc2018"``.
            dev_ds_ori: Not used by this method.

        Returns:
            ``None`` in perf mode; otherwise the predictions: one list per
            batch for ``"msra_ner"`` and the default tasks, or the
            ``compute_prediction`` result for ``"cmrc2018"``.
        """
        if args.collect_shape:
            self.set_dynamic_shape(args.max_seq_length, args.batch_size)

        # cmrc2018 features carry extra columns that must not be collated;
        # they are still needed on the full dataset for post-processing.
        if args.task_name == "cmrc2018":
            source = dataset.remove_columns(
                ["offset_mapping", "attention_mask", "example_id"])
        else:
            source = dataset
        sample_num = len(dataset)
        batches = [[
            source[start + offset]
            for offset in range(min(args.batch_size, sample_num - start))
        ] for start in range(0, sample_num, args.batch_size)]

        def model_inputs(batch):
            """Return the two numpy inputs fed to ``predict_batch``."""
            return [
                batch["input_ids"].numpy(), batch["token_type_ids"].numpy()
            ]

        if args.perf:
            # Warm-up: stops after the first batch whose index exceeds
            # perf_warmup_steps has been run.
            for i, batch in enumerate(batches):
                self.predict_batch(model_inputs(batchify_fn(batch)))
                if i > args.perf_warmup_steps:
                    break
            time1 = time.time()
            nums = 0
            for batch in batches:
                inputs = model_inputs(batchify_fn(batch))
                nums = nums + inputs[0].shape[0]
                self.predict_batch(inputs)
            total_time = time.time() - time1
            # Guard against a zero interval on coarse clocks or empty input.
            qps = nums / total_time if total_time > 0 else float("inf")
            print("task name: %s, sample nums: %s, time: %s, QPS: %s " %
                  (args.task_name, nums, total_time, qps))
            return None

        if args.task_name == "msra_ner":
            metric = ChunkEvaluator(label_list=args.label_list)
            metric.reset()
            all_predictions = []
            for batch in batches:
                batch = batchify_fn(batch)
                output = self.predict_batch(model_inputs(batch))[0]
                preds = np.argmax(output, axis=2)
                all_predictions.append(preds.tolist())
                num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                    batch["seq_len"], paddle.to_tensor(preds),
                    batch["labels"])
                metric.update(num_infer_chunks.numpy(),
                              num_label_chunks.numpy(),
                              num_correct_chunks.numpy())
            res = metric.accumulate()
            print("task name: %s, (precision, recall, f1): %s, " %
                  (args.task_name, res))
            # Returned for parity with the other task branches.
            return all_predictions

        if args.task_name == "cmrc2018":
            all_start_logits = []
            all_end_logits = []
            for batch in batches:
                batch = batchify_fn(batch)
                start_logits, end_logits = self.predict_batch(
                    model_inputs(batch))
                for idx in range(start_logits.shape[0]):
                    if len(all_start_logits) % 1000 == 0 and len(
                            all_start_logits):
                        print("Processing example: %d" %
                              len(all_start_logits))
                    all_start_logits.append(start_logits[idx])
                    all_end_logits.append(end_logits[idx])
            all_predictions, _, _ = compute_prediction(
                dev_example, dataset, (all_start_logits, all_end_logits),
                False, args.n_best_size, args.max_answer_length)
            res = squad_evaluate(
                examples=[raw_data for raw_data in dev_example],
                preds=all_predictions,
                is_whitespace_splited=False)
            print("task name: %s, EM: %s, F1: %s" %
                  (args.task_name, res['exact'], res['f1']))
            return all_predictions

        # Any other task: argmax over the class axis, scored with the metric
        # registered for the task.
        all_predictions = []
        metric = METRIC_CLASSES[args.task_name]()
        metric.reset()
        for batch in batches:
            batch = batchify_fn(batch)
            output = self.predict_batch(model_inputs(batch))[0]
            preds = np.argmax(output, axis=1)
            all_predictions.append(preds.tolist())
            correct = metric.compute(paddle.to_tensor(output),
                                     batch["labels"])
            metric.update(correct)
        res = metric.accumulate()

        print("task name: %s, acc: %s, " % (args.task_name, res))
        return all_predictions
示例#23
0
                                        collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_size=args.batch_size,
                                      return_list=True,
                                      collate_fn=batchify_fn)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=args.batch_size,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model netword and its loss
    ernie = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_vocab))
    model = ErnieCrfForTokenClassification(ernie)

    metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
    optimizer = paddle.optimizer.AdamW(learning_rate=2e-5,
                                       parameters=model.parameters())

    step = 0
    for epoch in range(args.epochs):
        for input_ids, token_type_ids, lengths, labels in train_loader:
            loss = model(input_ids,
                         token_type_ids,
                         lengths=lengths,
                         labels=labels)
            avg_loss = paddle.mean(loss)
            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            step += 1
示例#24
0
def do_train(args):
    """Fine-tune a token-classification model on an NER dataset.

    Loads ``args.dataset``, tokenizes the train and test splits (plus the
    validation split for ``"peoples_daily_ner"``), builds a
    ``LinearDecayWithWarmup`` schedule and an AdamW optimizer, then trains
    until ``num_training_steps`` is reached. Every ``args.save_steps`` steps,
    and on the final step, the rank-0 process evaluates and writes a
    checkpoint to ``args.output_dir``.

    Args:
        args: Parsed options. Reads ``device``, ``dataset``, ``model_type``,
            ``model_name_or_path``, ``max_seq_length``, ``batch_size``,
            ``max_steps``, ``num_train_epochs``, ``learning_rate``,
            ``warmup_steps``, ``adam_epsilon``, ``weight_decay``,
            ``logging_steps``, ``save_steps`` and ``output_dir``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    raw_datasets = load_dataset(args.dataset)
    # Only this dataset is evaluated on a validation split as well.
    has_dev_split = args.dataset == "peoples_daily_ner"

    AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    label_list = raw_datasets['train'].features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        """Tokenize a batch of examples and align labels to the token ids."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            # Truncate labels so they fit between the two added positions.
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            # One no-entity label on each side, then pad up to the length of
            # input_ids.
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # The raw columns of the train split are removed from every split after
    # tokenization.
    column_names = raw_datasets['train'].column_names

    def prepare_split(split_name):
        """Drop the last example of a split, then tokenize it."""
        # NOTE(review): the reason for dropping the final example is not
        # visible here - confirm it is intentional.
        split = raw_datasets[split_name]
        split = split.select(range(len(split) - 1))
        return split.map(tokenize_and_align_labels,
                         batched=True,
                         remove_columns=column_names)

    train_ds = prepare_split('train')

    # Padded label positions get ignore_label so the loss skips them.
    ignore_label = -100

    batchify_fn = DataCollatorForTokenClassification(tokenizer, ignore_label)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = prepare_split('test')

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    if has_dev_split:
        dev_ds = prepare_split('validation')

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network and its loss
    model = AutoForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            logits = model(batch['input_ids'], batch['token_type_ids'])
            loss = loss_fct(logits, batch['labels'])
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                # Only the rank-0 process evaluates and writes checkpoints.
                if paddle.distributed.get_rank() == 0:
                    if has_dev_split:
                        evaluate(model, loss_fct, metric, dev_data_loader,
                                 label_num, "valid")
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num, "test")

                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            # Stop as soon as the scheduled number of steps has been run.
            if global_step >= num_training_steps:
                return
示例#25
0
def do_train():
    """Fine-tune ERNIE 1.0 for token classification on DuEventExtraction data.

    All configuration is read from the module-level ``args`` object: device,
    data and tag paths, ``max_seq_len``, ``batch_size``, ``learning_rate``,
    ``weight_decay``, ``num_epoch``, the ``skip_step`` / ``valid_step``
    logging and validation intervals, and the ``checkpoints`` directory.

    On rank 0 the dev set is evaluated every ``args.valid_step`` steps and
    ``best.pdparams`` is written whenever the f1 value returned by
    ``evaluate`` improves. ``final.pdparams`` is written by rank 0 once all
    epochs are done.
    """
    paddle.set_device(args.device)
    world_size = paddle.distributed.get_world_size()
    rank = paddle.distributed.get_rank()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))
    model = paddle.DataParallel(model)

    print("============start train==========")
    train_ds = DuEventExtraction(args.train_data, args.tag_path)
    dev_ds = DuEventExtraction(args.dev_data, args.tag_path)
    test_ds = DuEventExtraction(args.test_data, args.tag_path)

    trans_func = partial(
        convert_example_to_feature,
        tokenizer=tokenizer,
        label_vocab=train_ds.label_vocab,
        max_seq_len=args.max_seq_len,
        no_entity_label=no_entity_label,
        ignore_label=ignore_label,
        is_test=False)

    pad_id = tokenizer.vocab[tokenizer.pad_token]
    collate = Tuple(
        Pad(axis=0, pad_val=pad_id, dtype='int32'),  # input ids
        # NOTE(review): token type ids are padded with the vocabulary pad id
        # rather than a dedicated segment pad id -- confirm this is intended.
        Pad(axis=0, pad_val=pad_id, dtype='int32'),  # token type ids
        Stack(dtype='int64'),  # sequence lens
        Pad(axis=0, pad_val=ignore_label, dtype='int64'),  # labels
    )

    def batchify_fn(samples):
        """Convert raw samples to features, then pad/stack them into a batch."""
        return collate(list(map(trans_func, samples)))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(
        dataset=dev_ds,
        batch_size=args.batch_size,
        collate_fn=batchify_fn)
    # Built for parity with the dev loader; not consumed inside this function.
    test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_size=args.batch_size,
        collate_fn=batchify_fn)

    num_training_steps = len(train_loader) * args.num_epoch
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test below O(1).
    decay_params = {
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(), suffix=False)
    criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    step, best_f1 = 0, 0.0
    model.train()
    for epoch in range(args.num_epoch):
        for input_ids, token_type_ids, _, labels in train_loader:
            # Flatten to (tokens, classes) / (tokens,) before the loss.
            logits = model(input_ids, token_type_ids).reshape(
                [-1, train_ds.label_num])
            loss = paddle.mean(criterion(logits, labels.reshape([-1])))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            if step > 0 and step % args.skip_step == 0 and rank == 0:
                print(f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) - loss: {loss_item:.6f}')
            if step > 0 and step % args.valid_step == 0 and rank == 0:
                p, r, f1, avg_loss = evaluate(model, criterion, metric, len(label_map), dev_loader)
                print(f'dev step: {step} - loss: {avg_loss:.5f}, precision: {p:.5f}, recall: {r:.5f}, '
                      f'f1: {f1:.5f} current best {best_f1:.5f}')
                if f1 > best_f1:
                    best_f1 = f1
                    # ".5f" (precision), not "5f" (field width), so the
                    # value is printed like the other metrics above.
                    print(f'==============================================save best model '
                          f'best performerence {best_f1:.5f}')
                    paddle.save(model.state_dict(), '{}/best.pdparams'.format(args.checkpoints))
            step += 1

    # save the final model
    if rank == 0:
        paddle.save(model.state_dict(), '{}/final.pdparams'.format(args.checkpoints))
示例#26
0
def do_train(args):
    """Fine-tune a token-classification model selected by ``args.model_type``.

    Args:
        args: Parsed configuration. The fields read here are ``device``,
            ``dataset``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.

    Returns:
        None. The function returns as soon as ``global_step`` reaches the
        computed number of training steps.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Only "peoples_daily_ner" is loaded with a separate dev split.
    has_dev_split = args.dataset == "peoples_daily_ner"

    # Create dataset, tokenizer and dataloader.
    if has_dev_split:
        train_ds, dev_ds, test_ds = load_dataset(
            args.dataset, splits=('train', 'dev', 'test'), lazy=False)
    else:
        train_ds, test_ds = load_dataset(
            args.dataset, splits=('train', 'test'), lazy=False)

    AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(
        tokenize_and_align_labels,
        tokenizer=tokenizer,
        no_entity_id=no_entity_id,
        max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    ignore_label = -100

    # presumably the collated fields come back in the key order declared
    # here, which the 4-way unpacking in the training loop relies on --
    # verify against the Dict batchify helper.
    collate = Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    })

    def batchify_fn(samples):
        """Pad/stack already-tokenized samples into one batch."""
        return collate(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(
        dataset=train_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_sampler=train_batch_sampler,
        return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(
        dataset=test_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_size=args.batch_size,
        return_list=True)

    if has_dev_split:
        dev_ds = dev_ds.map(trans_func)

        dev_data_loader = DataLoader(
            dataset=dev_ds,
            collate_fn=batchify_fn,
            num_workers=0,
            batch_size=args.batch_size,
            return_list=True)

    # Define the model network and its loss
    model = AutoForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # An explicit positive max_steps overrides the epoch-derived step count.
    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test below O(1).
    decay_params = {
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                # Evaluation and checkpointing happen on rank 0 only.
                if paddle.distributed.get_rank() == 0:
                    if has_dev_split:
                        evaluate(model, loss_fct, metric, dev_data_loader,
                                 label_num, "valid")
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num, "test")

                    paddle.save(model.state_dict(),
                                os.path.join(args.output_dir,
                                             "model_%d.pdparams" % global_step))
            if global_step >= num_training_steps:
                return
示例#27
0
def train(args):
    """Train a BiGruCrf tagger on LacDataset through the ``paddle.Model`` API.

    Args:
        args: Parsed configuration. The fields read here are ``n_gpu``,
            ``data_dir``, ``batch_size``, ``emb_dim``, ``hidden_size``,
            ``base_lr``, ``init_checkpoint``, ``verbose``, ``epochs`` and
            ``model_save_dir``.
    """
    device = "gpu" if args.n_gpu else "cpu"
    paddle.set_device(device)

    # Datasets: the 'test' split doubles as the evaluation set during fit().
    train_set = LacDataset(args.data_dir, mode='train')
    eval_set = LacDataset(args.data_dir, mode='test')

    collate = Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    )

    def batchify_fn(samples):
        """Pad/stack one list of samples into a batch."""
        return collate(samples)

    # Training batches: shuffled, incomplete last batch dropped.
    train_batches = paddle.io.DistributedBatchSampler(
        dataset=train_set,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_set,
        batch_sampler=train_batches,
        return_list=True,
        collate_fn=batchify_fn)

    # Evaluation batches: original order, nothing dropped.
    eval_batches = paddle.io.BatchSampler(
        dataset=eval_set,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=eval_set,
        batch_sampler=eval_batches,
        return_list=True,
        collate_fn=batchify_fn)

    # Network wrapped in the high-level Model helper.
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_set.vocab_size, train_set.num_labels)
    model = paddle.Model(network)

    # Optimizer, CRF loss and chunk-level metric.
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_set.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)

    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # A progress-bar callback is attached only in verbose mode.
    if args.verbose:
        callbacks = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    else:
        callbacks = None

    model.fit(
        train_data=train_loader,
        eval_data=test_loader,
        batch_size=args.batch_size,
        epochs=args.epochs,
        eval_freq=1,
        log_freq=10,
        save_dir=args.model_save_dir,
        save_freq=1,
        shuffle=True,
        callbacks=callbacks)
示例#28
0
def train(args):
    """Train a BiGruCrf model with a hand-written loop and timing statistics.

    Every ``args.logging_steps`` steps the averaged reader/batch cost and
    throughput are logged and the counters reset. Every ``args.save_steps``
    steps, and at the final step, rank 0 saves ``model_<step>.pdparams``;
    when ``args.do_eval`` is set it also evaluates on the test loader and
    keeps ``best_model.pdparams`` for the highest f1 score seen so far.

    Args:
        args: Parsed configuration. The fields read here are ``device``,
            ``emb_dim``, ``hidden_size``, ``crf_lr``, ``base_lr``,
            ``init_checkpoint``, ``epochs``, ``batch_size``,
            ``logging_steps``, ``save_steps``, ``model_save_dir`` and
            ``do_eval``; ``args`` is also handed to ``create_data_loader``.
    """
    paddle.set_device(args.device)
    set_seed(102)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    word_vocab, label_vocab, train_loader, test_loader = create_data_loader(
        args)

    # NOTE(review): the model is not wrapped in paddle.DataParallel in this
    # function even when several trainers are initialised -- confirm that
    # this is intended.
    model = BiGruCrf(
        args.emb_dim,
        args.hidden_size,
        len(word_vocab),
        len(label_vocab),
        crf_lr=args.crf_lr)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    chunk_evaluator = ChunkEvaluator(
        label_list=label_vocab.keys(), suffix=True)

    # Optional warm start; a missing file is only logged, not fatal.
    ckpt_path = args.init_checkpoint
    if ckpt_path:
        if os.path.exists(ckpt_path):
            logger.info("Init checkpoint from %s" % ckpt_path)
            model.load_dict(paddle.load(ckpt_path))
        else:
            logger.info("Cannot init checkpoint from %s which doesn't exist" %
                        ckpt_path)

    logger.info("Start training")
    steps_done = 0
    total_steps = args.epochs * len(train_loader)
    reader_time = 0.0
    compute_time = 0.0
    samples_seen = 0
    best_f1 = -1
    fetch_began = time.time()
    for _ in range(args.epochs):
        for batch in train_loader:
            # Time spent waiting for the loader since the previous step ended.
            reader_time += time.time() - fetch_began
            steps_done += 1
            token_ids, length, label_ids = batch

            compute_began = time.time()
            avg_loss = paddle.mean(model(token_ids, length, label_ids))
            compute_time += time.time() - compute_began
            samples_seen += args.batch_size

            if steps_done % args.logging_steps == 0:
                elapsed = reader_time + compute_time
                logger.info(
                    "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                    % (steps_done, total_steps, avg_loss,
                       reader_time / args.logging_steps,
                       elapsed / args.logging_steps,
                       samples_seen / args.logging_steps,
                       samples_seen / elapsed))
                # Start a fresh measurement window.
                reader_time = 0.0
                compute_time = 0.0
                samples_seen = 0

            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            at_checkpoint = (steps_done % args.save_steps == 0
                             or steps_done == total_steps)
            if at_checkpoint and rank == 0:
                step_file = os.path.join(args.model_save_dir,
                                         "model_%d.pdparams" % steps_done)
                paddle.save(model.state_dict(), step_file)
                logger.info("Save %d steps model." % (steps_done))
                if args.do_eval:
                    _, _, f1_score = evaluate(model, chunk_evaluator,
                                              test_loader)
                    if f1_score > best_f1:
                        best_f1 = f1_score
                        best_file = os.path.join(args.model_save_dir,
                                                 "best_model.pdparams")
                        paddle.save(model.state_dict(), best_file)
                        logger.info("Save best model.")

            # Restart the reader timer only after all per-step work is done.
            fetch_began = time.time()
示例#29
0
                                      drop_last=True,
                                      return_list=True,
                                      collate_fn=batchify_fn)

    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=200,
                                       drop_last=True,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    network = BiGRUWithCRF(300, 300, train_ds.word_num, train_ds.label_num)
    model = paddle.Model(network)

    optimizer = paddle.optimizer.Adam(learning_rate=0.001,
                                      parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf.transitions)
    chunk_evaluator = ChunkEvaluator((train_ds.label_num + 2) // 2, 'IOB')
    model.prepare(optimizer, crf_loss, chunk_evaluator)

    model.fit(train_data=train_loader,
              eval_data=dev_loader,
              epochs=10,
              save_dir='./results',
              log_freq=1)

    model.evaluate(eval_data=test_loader)
    outputs, lens, decodes = model.predict(test_data=test_loader)
    preds = parse_decodes(test_ds, decodes, lens)

    print('\n'.join(preds[:10]))
示例#30
0
                                        shuffle=True,
                                        return_list=True,
                                        collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_size=200,
                                      return_list=True,
                                      collate_fn=batchify_fn)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=200,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=train_ds.label_num)

    metric = ChunkEvaluator((train_ds.label_num + 2) // 2, "IOB")
    loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    optimizer = paddle.optimizer.AdamW(learning_rate=2e-5,
                                       parameters=model.parameters())

    step = 0
    for epoch in range(10):
        model.train()
        for idx, (input_ids, segment_ids, length,
                  labels) in enumerate(train_loader):
            logits = model(input_ids,
                           segment_ids).reshape([-1, train_ds.label_num])
            loss = paddle.mean(loss_fn(logits, labels.reshape([-1])))
            loss.backward()
            optimizer.step()
            optimizer.clear_gradients()