Example #1
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)

    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=args.num_classes)
    map_fn = partial(map_fn_DuCheckList, args=args, tokenizer=tokenizer)
    dev_ds = RCInterpret().read(args.data_dir)

    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict(
        {
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "offset_mapping": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "overflow_to_sample": Stack(dtype='int32'),
        }): fn(samples)

    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
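
The returned tuple can be consumed directly for extractive-QA inference. Below is a minimal, hypothetical usage sketch, not taken from the source: it assumes the batch fields arrive in the order defined by the Dict above and that the model returns (start_logits, end_logits) as PaddleNLP's stock QA head does; the num_classes argument hints at a project-specific head, so the real return signature may differ.

# Hypothetical usage of Example #1's return values (assumptions noted above).
import paddle

model, tokenizer, dev_dataloader, dev_ds = init_roberta_var(args)
model.eval()
with paddle.no_grad():
    for batch in dev_dataloader:
        # Field order follows the Dict definition in init_roberta_var.
        input_ids, token_type_ids, offset_mapping, overflow_to_sample = batch
        start_logits, end_logits = model(input_ids=input_ids,
                                         token_type_ids=token_type_ids)
        # Greedy span prediction per sequence in the batch.
        start_idx = paddle.argmax(start_logits, axis=-1)
        end_idx = paddle.argmax(end_logits, axis=-1)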
Example #2
def init_roberta_var(args):
    if args.language == "ch":
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForSequenceClassification.from_pretrained(
        args.from_pretrained,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        dropout=0,
        num_labels=2,
        name='',
        return_inter_score=True)

    map_fn = partial(map_fn_senti, tokenizer=tokenizer, language=args.language)

    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'),
                              args.language)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict(
        {
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id)
        }): fn(samples)

    dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_sampler=dev_batch_sampler,
                                      collate_fn=batchify_fn,
                                      return_list=True)

    return model, tokenizer, dataloader
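
A minimal, hypothetical inference loop over the returned dataloader is sketched below; it is not from the source. Because the model is built with return_inter_score=True (which suggests a project-specific head that also returns intermediate scores), treating outputs[0] as the class logits is an assumption.

# Hypothetical usage of Example #2's return values (assumptions noted above).
import paddle
import paddle.nn.functional as F

model, tokenizer, dataloader = init_roberta_var(args)
model.eval()
with paddle.no_grad():
    for input_ids, token_type_ids in dataloader:
        outputs = model(input_ids, token_type_ids)
        # Assumed: the first element holds the classification logits.
        logits = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
        probs = F.softmax(logits, axis=-1)      # class probabilities
        preds = paddle.argmax(probs, axis=-1)   # predicted labels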
Example #3
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)

    model = RobertaForQuestionAnswering.from_pretrained(args.from_pretrained)
    map_fn = functools.partial(map_fn_DuCheckList,
                               args=args,
                               tokenizer=tokenizer)
    dev_ds = RCInterpret().read(os.path.join(args.data_dir, 'dev'))
    # dev_ds = load_dataset('squad', splits='dev_v2', data_files=None)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict(
        {
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
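
All three init_roberta_var variants rely on the same paddlenlp.data collate pattern: Dict maps each field name to a collate op, Pad brings variable-length sequences to a common length, and Stack batches fixed-size values. The toy sketch below (made-up data, not from the source) shows what such a batchify_fn produces:

# Toy illustration of the Dict/Pad collate pattern (not from the source).
from paddlenlp.data import Dict, Pad

samples = [
    {"input_ids": [101, 7, 8, 102], "token_type_ids": [0, 0, 0, 0]},
    {"input_ids": [101, 9, 102], "token_type_ids": [0, 0, 0]},
]
batchify_fn = lambda samples, fn=Dict({
    "input_ids": Pad(axis=0, pad_val=0),
    "token_type_ids": Pad(axis=0, pad_val=0),
}): fn(samples)

input_ids, token_type_ids = batchify_fn(samples)
# input_ids is a (2, 4) array; the shorter sequence is padded with pad_val.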
Example #4
def do_train():
    """
    This function is the main part of the fine-tuning process.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    if args.language == 'ch':
        train_ds, dev_ds = load_dataset("chnsenticorp",
                                        splits=["train", "dev"])
        if args.base_model == 'roberta_base':
            tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-wwm-ext', num_classes=2)
        elif args.base_model == 'roberta_large':
            tokenizer = RobertaTokenizer.from_pretrained(
                'roberta-wwm-ext-large')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-wwm-ext-large', num_classes=2)
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        "sst-2",
                                        splits=["train", "dev"])
        # For the English version, we load models from the local machine.
        if args.base_model == 'roberta_base':
            tokenizer = RobertaBPETokenizer.from_pretrained('roberta-base')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-base', num_classes=2)
        elif args.base_model == 'roberta_large':
            tokenizer = RobertaBPETokenizer.from_pretrained('roberta-large')
            model = RobertaForSequenceClassification.from_pretrained(
                'roberta-large', num_classes=2)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         language=args.language)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    log_per_step = 100 if args.language == 'en' else 10
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids=input_ids, token_type_ids=token_type_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % log_per_step == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc, log_per_step /
                       (time.time() - tic_train)),
                    flush=True)
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % (log_per_step * 10) == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
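
The training script above calls a create_dataloader helper whose definition is not shown. In PaddleNLP's example scripts this helper usually maps the transform onto the dataset, picks a DistributedBatchSampler for training and a plain BatchSampler otherwise, and wraps the result in paddle.io.DataLoader. The sketch below follows that pattern and is an assumption about the helper's shape, not the original definition.

# Assumed shape of the create_dataloader helper (not the original definition).
import paddle

def create_dataloader(dataset, mode='train', batch_size=1,
                      batchify_fn=None, trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)  # e.g. the convert_example transform
    shuffle = (mode == 'train')
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    return paddle.io.DataLoader(dataset=dataset,
                                batch_sampler=batch_sampler,
                                collate_fn=batchify_fn,
                                return_list=True)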
Example #5
                tokenized_examples[i]['answerable_label'] = 1

    return tokenized_examples


if __name__ == "__main__":
    args = get_args()
    log.debug('-----------  Configuration Arguments -----------')
    for arg, value in sorted(six.iteritems(vars(args))):
        log.debug('%s: %s' % (arg, value))
    log.debug('------------------------------------------------')

    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=2)

    train_ds = DuReaderChecklist().read(args.train_data_dir)
    dev_ds = DuReaderChecklist().read(args.dev_data_dir)

    train_ds.map(map_fn_DuCheckList_finetune, batched=True)
    dev_ds.map(map_fn_DuCheckList_finetune, batched=True)

    log.debug('train set: %d' % len(train_ds))
    log.debug('dev set: %d' % len(dev_ds))

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.bsz, shuffle=True)
    dev_batch_sample = paddle.io.DistributedBatchSampler(
        dev_ds, batch_size=args.bsz, shuffle=False)