Example #1
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))
    set_seed(args)
    # Only data parallel is supported in dygraph mode for now.
    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=worker_num)

    default_global_batch_size = topo.data_info.size * args.micro_batch_size
    default_global_tokens_num = default_global_batch_size * args.max_seq_len

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define log writer
    log_writer_path = os.path.join(
        args.output_dir, "train_log",
        "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
            args.model_name_or_path,
            args.micro_batch_size * topo.data_info.size, False, False,
            worker_index).lower())
    if os.path.exists(log_writer_path):
        import shutil
        shutil.rmtree(log_writer_path)
    log_writer = LogWriter(log_writer_path)

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
        model = GPTForPretraining(GPTModel(**model_config))
    else:
        model = GPTForPretraining.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    # Create the criterion for the GPT model
    criterion = GPTPretrainingCriterion()

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps

    lr_scheduler = None

    if args.lr_decay_style == "none":
        lr_scheduler = None
    elif args.lr_decay_style == "cosine":
        lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_step=warmup_step,
            decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler
        if lr_scheduler is not None else args.max_lr,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
        if os.path.exists(opt_path):
            opt_dict = paddle.load(opt_path)
            optimizer.set_state_dict(opt_dict)
        else:
            logger.warning("No optimizer checkpoint file found in %s." %
                           opt_path)

    global_step = 0
    epoch = 0
    tic_train = time.time()
    while True:
        files = get_train_data_file(args)
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                args, [data_file],
                local_rank=local_rank,
                data_world_size=topo.data_info.size,
                data_world_rank=topo.data_info.rank,
                eos_id=tokenizer.eos_token_id)
            # Bug fix: if valid_data_loader is not called here, enumerate would call it
            # repeatedly and start a new random dataloader each time.
            valid_data_loader = valid_data_loader()
            test_data_loader = test_data_loader()

            # time count
            train_reader_cost = 0.0
            train_run_cost = 0.0
            reader_start = time.time()
            for step, batch in enumerate(train_data_loader()):
                train_reader_cost += time.time() - reader_start
                train_start = time.time()

                global_step += 1
                tokens, loss_mask, attention_mask, position_ids, labels = batch
                loss_mask.stop_gradient = True
                attention_mask.stop_gradient = True
                with paddle.amp.auto_cast(
                        args.use_amp,
                        custom_white_list=["layer_norm", "softmax", "gelu"],
                        custom_black_list=[
                            "reduce_sum", "c_softmax_with_cross_entropy",
                            "c_embedding"
                        ]):

                    preds = model(tokens, position_ids, attention_mask)
                    loss = criterion(preds, labels, loss_mask)

                if args.use_amp:
                    scaler.scale(loss).backward()
                    scaler.minimize(optimizer, loss)
                else:
                    loss.backward()
                    optimizer.step()

                if lr_scheduler is not None:
                    lr_scheduler.step()
                optimizer.clear_grad()

                loss_numpy = loss.numpy()
                train_run_cost += time.time() - train_start

                # Profile for model benchmark
                profiler.add_profiler_step(args.profiler_options)

                if global_step % args.logging_freq == 0:
                    speed = args.logging_freq / (train_reader_cost +
                                                 train_run_cost)
                    avg_reader_cost = train_reader_cost / args.logging_freq
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, loss_numpy,
                           avg_reader_cost, 1. / speed, speed, speed *
                           default_global_tokens_num, optimizer.get_lr()))
                    log_writer.add_scalar("loss", loss_numpy, global_step)
                    log_writer.add_scalar("learning_rate", optimizer.get_lr(),
                                          global_step)

                    tic_train = time.time()
                    train_reader_cost = 0.0
                    train_run_cost = 0.0

                if args.check_accuracy:
                    if global_step >= args.max_steps:
                        return
                    else:
                        continue

                if global_step % args.eval_freq == 0:
                    # Since the valid data is broadcast to all devices, we evaluate on all devices.
                    run_evaluate(valid_data_loader, model, criterion,
                                 args.eval_iters, log_writer, global_step,
                                 epoch, "valid")

                if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        logger.info("Save model to %s" % output_dir)
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))

                if global_step >= args.max_steps:
                    run_evaluate(test_data_loader, model, criterion,
                                 args.test_iters, log_writer, global_step,
                                 epoch, "test")
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

                reader_start = time.time()

            del train_data_loader
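
The use_amp branch above combines paddle.amp.auto_cast with a GradScaler. A minimal, hedged sketch of just that pattern (a toy Conv2D model and random data stand in for the GPT pipeline; hyperparameters are placeholders):

import paddle

# Toy stand-ins for the GPT model/optimizer above; any dygraph model follows the same pattern.
model = paddle.nn.Conv2D(3, 2, 3)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)   # plays the role of args.scale_loss

data = paddle.rand([10, 3, 32, 32])

# White-listed ops run in float16 inside this context; black-listed ops are kept in float32.
with paddle.amp.auto_cast(custom_white_list=["gelu"], custom_black_list=["reduce_sum"]):
    loss = paddle.mean(model(data))

scaled = scaler.scale(loss)          # scale the loss to avoid float16 gradient underflow
scaled.backward()                    # gradients are computed on the scaled loss
scaler.minimize(optimizer, scaled)   # unscale, skip the step on inf/nan, then update
optimizer.clear_grad()
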
Example #2
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])

    # If you want to use a BERT/RoBERTa/ELECTRA pretrained model:
    # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_classes=2)
    # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_classes=2)
    # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2)
    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
        'ernie-tiny', num_classes=len(train_ds.label_list))

    # If you want to use a BERT/RoBERTa/ELECTRA pretrained model:
    # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small')
    # ErnieTinyTokenizer is specific to the ernie-tiny pretrained model.
    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % 100 == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
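
Nearly every example on this page builds decay_params by name and passes apply_decay_param_fun so that biases and LayerNorm weights are excluded from weight decay. A stripped-down sketch of just that mechanism (the tiny layer and values are illustrative only):

import paddle

# Tiny layer whose parameter paths contain "bias" and "norm", mirroring the transformer blocks above.
class TinyBlock(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 16)
        self.norm = paddle.nn.LayerNorm(16)

    def forward(self, x):
        return self.norm(self.linear(x))

model = TinyBlock()

# named_parameters() yields paths such as "linear.weight", "linear.bias", "norm.weight", "norm.bias";
# only "linear.weight" survives the filter, so only it receives weight decay.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.01,
    # AdamW calls this with each parameter's framework name (p.name); decay is applied only when it returns True.
    apply_decay_param_fun=lambda x: x in decay_params)
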
Example #3
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)

    tokenizer = GPTChineseTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict(
        {
            'input_ids': Pad(axis=0, pad_val=0, dtype='int64'),  # input
            'seq_len': Stack(dtype='int64'),  # seq_len
            'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
        }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = GPTForTokenClassification.from_pretrained(args.model_name_or_path,
                                                      num_classes=label_num)
    #model = ErnieCtmForTokenClassification.from_pretrained(
    #    args.model_name_or_path, num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    last_step = args.num_train_epochs * len(train_data_loader)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, _, labels = batch
            logits = model(input_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == last_step:
                if paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
Example #4
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])

        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_ds_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_ds_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_ds_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_ds_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_ds = dev_ds.map(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1

            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0 or global_step == num_training_steps:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                return
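
LinearDecayWithWarmup above accepts either an integer warmup-step count (args.warmup_steps) or a float warmup proportion (args.warmup_proportion). A quick hedged sketch to inspect the schedule it produces (toy numbers, unrelated to the GLUE run above):

import paddle
from paddlenlp.transformers import LinearDecayWithWarmup

# Toy parameters just to attach an optimizer; 10 total steps with a 20% warmup
# (passing warmup=2 as an integer step count would be equivalent).
model = paddle.nn.Linear(4, 4)
scheduler = LinearDecayWithWarmup(1e-3, total_steps=10, warmup=0.2)
optimizer = paddle.optimizer.AdamW(learning_rate=scheduler,
                                   parameters=model.parameters())

for _ in range(10):
    # Ramps linearly to 1e-3 over the first 2 steps, then decays linearly toward 0.
    print(round(optimizer.get_lr(), 6))
    scheduler.step()
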
Example #5
File: train.py  Project: Ginnna/PaddleNLP
def do_train(args):
    if args.device == "gpu":
        rank = dist.get_rank()
        trainer_count = dist.get_world_size()
    else:
        rank = 0
        trainer_count = 1
        paddle.set_device("cpu")

    if trainer_count > 1:
        dist.init_parallel_env()

    # Set seed for CE (continuous evaluation)
    random_seed = eval(str(args.random_seed))
    if random_seed is not None:
        paddle.seed(random_seed)

    # Define data loader
    (train_loader), (eval_loader) = reader.create_data_loader(args)

    # Define model
    transformer = TransformerModel(src_vocab_size=args.src_vocab_size,
                                   trg_vocab_size=args.trg_vocab_size,
                                   max_length=args.max_length + 1,
                                   n_layer=args.n_layer,
                                   n_head=args.n_head,
                                   d_model=args.d_model,
                                   d_inner_hid=args.d_inner_hid,
                                   dropout=args.dropout,
                                   weight_sharing=args.weight_sharing,
                                   bos_id=args.bos_idx,
                                   eos_id=args.eos_idx)

    # Define loss
    criterion = CrossEntropyCriterion(args.label_smooth_eps, args.bos_idx)

    scheduler = paddle.optimizer.lr.NoamDecay(args.d_model,
                                              args.warmup_steps,
                                              args.learning_rate,
                                              last_epoch=0)

    # Define optimizer
    optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                      beta1=args.beta1,
                                      beta2=args.beta2,
                                      epsilon=float(args.eps),
                                      parameters=transformer.parameters())

    # Init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        model_dict = paddle.load(
            os.path.join(args.init_from_checkpoint, "transformer.pdparams"))
        opt_dict = paddle.load(
            os.path.join(args.init_from_checkpoint, "transformer.pdopt"))
        transformer.set_state_dict(model_dict)
        optimizer.set_state_dict(opt_dict)
        print("loaded from checkpoint.")
    # Init from a pretrained model, to better solve the current task
    if args.init_from_pretrain_model:
        model_dict = paddle.load(
            os.path.join(args.init_from_pretrain_model,
                         "transformer.pdparams"))
        transformer.set_state_dict(model_dict)
        print("loaded from pre-trained model.")

    if trainer_count > 1:
        transformer = paddle.DataParallel(transformer)

    # The best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - args.label_smooth_eps) * np.log((1. - args.label_smooth_eps)) +
        args.label_smooth_eps * np.log(args.label_smooth_eps /
                                       (args.trg_vocab_size - 1) + 1e-20))

    step_idx = 0

    # For benchmark
    reader_cost_avg = AverageStatistical()
    batch_cost_avg = AverageStatistical()
    batch_ips_avg = AverageStatistical()

    # Train loop
    for pass_id in range(args.epoch):
        epoch_start = time.time()

        batch_id = 0
        batch_start = time.time()
        for input_data in train_loader:
            # NOTE: Used for benchmarking; max_iter defaults to None.
            if args.max_iter and step_idx == args.max_iter:
                return
            train_reader_cost = time.time() - batch_start
            (src_word, trg_word, lbl_word) = input_data

            if args.use_amp:
                scaler = paddle.amp.GradScaler(
                    init_loss_scaling=args.scale_loss)
                with paddle.amp.auto_cast():
                    logits = transformer(src_word=src_word, trg_word=trg_word)
                    sum_cost, avg_cost, token_num = criterion(logits, lbl_word)

                scaled = scaler.scale(avg_cost)  # scale the loss
                scaled.backward()  # do backward

                scaler.minimize(optimizer, scaled)  # update parameters
                optimizer.clear_grad()
            else:
                logits = transformer(src_word=src_word, trg_word=trg_word)
                sum_cost, avg_cost, token_num = criterion(logits, lbl_word)

                avg_cost.backward()

                optimizer.step()
                optimizer.clear_grad()

            tokens_per_cards = token_num.numpy()

            train_batch_cost = time.time() - batch_start
            reader_cost_avg.record(train_reader_cost)
            batch_cost_avg.record(train_batch_cost)
            batch_ips_avg.record(train_batch_cost, tokens_per_cards)

            # NOTE: For benchmarking, loss information on all cards will be printed.
            if step_idx % args.print_step == 0 and (args.benchmark
                                                    or rank == 0):
                total_avg_cost = avg_cost.numpy()

                if step_idx == 0:
                    logger.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f " %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                else:
                    train_avg_batch_cost = args.print_step / batch_cost_avg.get_total_time(
                    )
                    logger.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f, avg_speed: %.2f step/sec, "
                        "batch_cost: %.5f sec, reader_cost: %.5f sec, tokens: %d, "
                        "ips: %.5f words/sec" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)]),
                         train_avg_batch_cost, batch_cost_avg.get_average(),
                         reader_cost_avg.get_average(),
                         batch_ips_avg.get_total_cnt(),
                         batch_ips_avg.get_average_per_sec()))
                reader_cost_avg.reset()
                batch_cost_avg.reset()
                batch_ips_avg.reset()

            if step_idx % args.save_step == 0 and step_idx != 0:
                # Validation
                transformer.eval()
                total_sum_cost = 0
                total_token_num = 0
                with paddle.no_grad():
                    for input_data in eval_loader:
                        (src_word, trg_word, lbl_word) = input_data
                        logits = transformer(src_word=src_word,
                                             trg_word=trg_word)
                        sum_cost, avg_cost, token_num = criterion(
                            logits, lbl_word)
                        total_sum_cost += sum_cost.numpy()
                        total_token_num += token_num.numpy()
                        total_avg_cost = total_sum_cost / total_token_num
                    logger.info(
                        "validation, step_idx: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f" %
                        (step_idx, total_avg_cost, total_avg_cost -
                         loss_normalizer, np.exp([min(total_avg_cost, 100)])))
                transformer.train()

                if args.save_model and rank == 0:
                    model_dir = os.path.join(args.save_model,
                                             "step_" + str(step_idx))
                    if not os.path.exists(model_dir):
                        os.makedirs(model_dir)
                    paddle.save(
                        transformer.state_dict(),
                        os.path.join(model_dir, "transformer.pdparams"))
                    paddle.save(optimizer.state_dict(),
                                os.path.join(model_dir, "transformer.pdopt"))

            batch_id += 1
            step_idx += 1
            scheduler.step()
            batch_start = time.time()

        train_epoch_cost = time.time() - epoch_start
        logger.info("train epoch: %d, epoch_cost: %.5f s" %
                    (pass_id, train_epoch_cost))

    if args.save_model and rank == 0:
        model_dir = os.path.join(args.save_model, "step_final")
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        paddle.save(transformer.state_dict(),
                    os.path.join(model_dir, "transformer.pdparams"))
        paddle.save(optimizer.state_dict(),
                    os.path.join(model_dir, "transformer.pdopt"))
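
loss_normalizer above is the entropy of the label-smoothed target distribution, i.e. the lowest cross-entropy the model could reach, which is why it is subtracted to report a "normalized loss". A short numeric sketch of the same formula (epsilon and vocab size are placeholders):

import numpy as np

# Illustrative values only; the script takes these from args.
label_smooth_eps = 0.1
trg_vocab_size = 10000

# Cross-entropy of the label-smoothed target against itself: (1 - eps) of the mass sits on the
# true token and eps is spread uniformly over the remaining trg_vocab_size - 1 tokens.
loss_normalizer = -(
    (1. - label_smooth_eps) * np.log(1. - label_smooth_eps) +
    label_smooth_eps * np.log(label_smooth_eps / (trg_vocab_size - 1) + 1e-20))
print(loss_normalizer)  # roughly 1.25 nats for these values
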
Example #6
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    eod_id = tokenizer.command_name_map["eod"].Id

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models_list:
        model = GPT2ForPretraining(
            GPT2Model(**model_class.pretrained_init_configuration[
                args.model_name_or_path]))
    else:
        model = GPT2ForPretraining.from_pretrained(args.model_name_or_path)

    # Create the criterion for the GPT model
    criterion = GPT2PretrainingCriterion()

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps
    lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
        max_lr=args.max_lr,
        min_lr=args.min_lr,
        warmup_step=warmup_step,
        decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByNorm(clip_norm=args.grad_clip)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)
    if args.model_name_or_path not in pretrained_models_list:
        opt_dict = paddle.load(
            os.path.join(args.model_name_or_path, "model_state.pdopt"))
        optimizer.set_state_dict(opt_dict)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if (os.path.isfile(os.path.join(args.input_dir, f)) and "npz_"
                not in str(f))
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader = create_pretrained_dataset(
                args,
                data_file,
                worker_init,
                worker_index,
                worker_num,
                eod_id=eod_id)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                tokens, loss_mask, attention_mask, position_ids, labels = batch

                loss_mask.stop_gradient = True
                attention_mask.stop_gradient = True

                preds = model(tokens, position_ids, attention_mask)
                loss = criterion(preds, labels, loss_mask)

                if global_step % args.logging_steps == 0:
                    if worker_index == 0:
                        logger.info(
                            "global step %d, epoch: %d, lr: %.10f, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, epoch, optimizer.get_lr(), step,
                               loss,
                               args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        logger.info("Save model to %s" % output_dir)
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))
                if global_step >= args.max_steps:
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

            del train_data_loader
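
The save branch writes the pretrained-style model/tokenizer directory plus a separate model_state.pdopt for the optimizer, and the resume branch near the top reloads only the optimizer state. A hedged sketch of that state-dict round trip with a toy layer (paths are placeholders; save_pretrained()/from_pretrained() handle the model and tokenizer part in the example itself):

import os
import paddle

# Toy stand-ins: state_dict round trips work the same way for any dygraph model and optimizer.
model = paddle.nn.Linear(8, 8)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=model.parameters())

output_dir = "model_100"  # e.g. "model_%d" % global_step, as in the example
os.makedirs(output_dir, exist_ok=True)
paddle.save(model.state_dict(), os.path.join(output_dir, "model_state.pdparams"))
paddle.save(optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt"))

# Resuming later: restore parameters first, then the optimizer moments and LR state.
model.set_state_dict(paddle.load(os.path.join(output_dir, "model_state.pdparams")))
optimizer.set_state_dict(paddle.load(os.path.join(output_dir, "model_state.pdopt")))
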
Example #7
def main(args):
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    config["use_data_parallel"] = use_data_parallel

    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)

    # Prepare the network for quantization-aware training
    quant_config = get_default_quant_config()
    quant_config["activation_preprocess_type"] = "PACT"
    quanter = QAT(config=quant_config)
    quanter.quantize(net)

    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    init_model(config, net, optimizer)

    if config["use_data_parallel"]:
        net = paddle.DataParallel(net)

    train_dataloader = Reader(config, 'train', places=place)()

    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id
    for epoch_id in range(last_epoch_id + 1, config.epochs):
        net.train()
        # 1. train with train dataset
        program.run(train_dataloader, config, net, optimizer, lr_scheduler,
                    epoch_id, 'train')

        # 2. validate with validate dataset
        if config.validate and epoch_id % config.valid_interval == 0:
            net.eval()
            with paddle.no_grad():
                top1_acc = program.run(valid_dataloader, config, net, None,
                                       None, epoch_id, 'valid')
            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                best_top1_epoch = epoch_id
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(net, optimizer, model_path, "best_model")
            message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                best_top1_acc, best_top1_epoch)
            logger.info(message)

        # 3. save the persistable model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            save_model(net, optimizer, model_path, epoch_id)
Example #8
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0
    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
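
load_dict, read_custom_data and convert_example are project-local helpers that are not shown on this page. Purely as an illustration, a hypothetical load_dict that builds tags_to_idx from a one-tag-per-line tags.txt could look like this:

import os

def load_dict(dict_path):
    """Hypothetical helper: map each tag (one per line in tags.txt) to its line index."""
    with open(dict_path, "r", encoding="utf-8") as fp:
        tags = [line.strip() for line in fp if line.strip()]
    return {tag: idx for idx, tag in enumerate(tags)}

# tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))
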
Example #9
def distributed_data_parallel(self):
    strategy = paddle.distributed.prepare_context()
    for net_name, net in self.model.nets.items():
        self.model.nets[net_name] = paddle.DataParallel(net, strategy)
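
prepare_context with an explicit strategy is the older dygraph data-parallel API. On current Paddle releases the same wrapping is usually done after init_parallel_env without a strategy object; a hedged sketch of the equivalent:

import paddle
import paddle.distributed as dist

def distributed_data_parallel(self):
    # Hedged modern equivalent (assumption: Paddle >= 2.0 dygraph mode); no strategy object is needed.
    if dist.get_world_size() > 1:
        dist.init_parallel_env()
        for net_name, net in self.model.nets.items():
            self.model.nets[net_name] = paddle.DataParallel(net)
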
Example #10
def do_train(args):
    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."

    # Initialization for the parallel environment
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()

    # Set the random seed for the training process
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Get the model class and tokenizer class
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define the pretraining model and its criterion
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models_list:
        model = BigBirdForPretraining(
            BigBirdModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path]))
    else:
        model = BigBirdForPretraining.from_pretrained(args.model_name_or_path)
    # Get the BigBird config for generating the random attention mask
    global config
    config = getattr(model, BigBirdForPretraining.base_model_prefix).config
    criterion = BigBirdPretrainingCriterion(config["vocab_size"], args.use_nsp)
    if worker_num > 1:
        model = paddle.DataParallel(model)

    # Define the learning rate scheduler and optimizer
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, args.max_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.epochs):
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            train_data_loader = create_dataloader(files[f_id], tokenizer,
                                                  worker_init, args.batch_size,
                                                  args.max_encoder_length,
                                                  args.max_pred_length)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
                 masked_lm_weights, next_sentence_labels,
                 masked_lm_scale) = batch[:7]
                rand_mask_idx_list = batch[7:]

                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    rand_mask_idx_list=rand_mask_idx_list,
                    masked_positions=masked_lm_positions)
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_ids, next_sentence_labels,
                                 masked_lm_scale, masked_lm_weights)
                if global_step % args.logging_steps == 0 and worker_index == 0:
                    logger.info(
                        "global step %d, epoch: %d, lr: %.10f, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, optimizer.get_lr(), loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
Example #11
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    # If you want to use a BERT/RoBERTa pretrained model:
    # pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese')
    # pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext')
    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0')

    latest_checkpoint, latest_global_step = get_latest_checkpoint(args)
    logger.info("get latest_checkpoint:{}".format(latest_checkpoint))

    model = SemanticIndexANCE(pretrained_model,
                              margin=args.margin,
                              output_emb_size=args.output_emb_size)

    if latest_checkpoint:
        state_dict = paddle.load(latest_checkpoint)
        model.set_dict(state_dict)
        print("warmup from:{}".format(latest_checkpoint))

    model = paddle.DataParallel(model)

    # If you want to use a BERT/RoBERTa pretrained model:
    # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')

    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pos_sample_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # pos_sample_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # neg_sample_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # neg_sample_segment
    ): [data for data in fn(samples)]

    global_step = 0

    while global_step < args.max_training_steps:
        latest_ann_data, latest_ann_data_step = get_latest_ann_data(
            args.ann_data_dir)

        if latest_ann_data_step == -1:
            # No ann_data generated yet
            latest_ann_data = args.train_set_file
            logger.info(
                "No ann_data generated yet, using train_set_file: {}".format(
                    args.train_set_file))
        else:
            # Use the latest ann_data to train the model
            logger.info("Latest ann_data is ready for training: [{}]".format(
                latest_ann_data))

        train_ds = load_dataset(read_text_triplet,
                                data_path=latest_ann_data,
                                lazy=False)

        train_data_loader = create_dataloader(train_ds,
                                              mode='train',
                                              batch_size=args.batch_size,
                                              batchify_fn=batchify_fn,
                                              trans_fn=trans_func)

        num_training_steps = len(train_data_loader) * args.epochs

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]

        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=clip)

        tic_train = time.time()
        for epoch in range(1, args.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                text_input_ids, text_token_type_ids, pos_sample_input_ids, pos_sample_token_type_ids, neg_sample_input_ids, neg_sample_token_type_ids, = batch

                loss = model(
                    text_input_ids=text_input_ids,
                    pos_sample_input_ids=pos_sample_input_ids,
                    neg_sample_input_ids=neg_sample_input_ids,
                    text_token_type_ids=text_token_type_ids,
                    pos_sample_token_type_ids=pos_sample_token_type_ids,
                    neg_sample_token_type_ids=neg_sample_token_type_ids)

                global_step += 1
                if global_step % 10 == 0 and rank == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s, training_file: %s"
                        % (global_step, epoch, step, loss, 10 /
                           (time.time() - tic_train), latest_ann_data))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0 and rank == 0:
                    save_dir = os.path.join(args.save_dir, str(global_step))
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    save_param_path = os.path.join(save_dir,
                                                   'model_state.pdparams')
                    paddle.save(model.state_dict(), save_param_path)
                    tokenizer.save_pretrained(save_dir)

                    # Flag file to indicate the model was saved successfully
                    succeed_flag_file = os.path.join(save_dir,
                                                     "succeed_flag_file")
                    open(succeed_flag_file, 'a').close()
Example #12
def train(args):
    paddle.set_device(args.device)
    n_procs = dist.get_world_size()
    rank = dist.get_rank()

    if n_procs > 1:
        dist.init_parallel_env()

    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_procs > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.train()

    global_norm_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.Adagrad(learning_rate=args.lr,
                                         parameters=elmo.parameters(),
                                         initial_accumulator_value=1.0,
                                         grad_clip=global_norm_clip)
    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    if args.init_from_ckpt:
        weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
        opt_state_dict = paddle.load(args.init_from_ckpt + '.pdopt')
        elmo.set_state_dict(weight_state_dict)
        optimizer.set_state_dict(opt_state_dict)
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    train_dataset = OneBillionWordDataset(args.train_data_path,
                                          vocab,
                                          args.batch_size,
                                          args.unroll_steps,
                                          n_procs=n_procs,
                                          rank=rank,
                                          mode='train',
                                          shuffle=True,
                                          seed=args.seed)

    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)

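    # One optimizer step consumes batch_size * unroll_steps tokens on each of
    # the n_procs workers, which is what fixes the number of steps per epoch
    # and the total step count below.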
    n_tokens_per_batch = args.batch_size * args.unroll_steps * n_procs
    n_steps_per_epoch = int(train_dataset.number_of_tokens /
                            n_tokens_per_batch)
    n_steps_total = args.epochs * n_steps_per_epoch
    print("Training for %s epochs and %s steps" % (args.epochs, n_steps_total))

    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(train_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)
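        # ppl is exp of the (per-token average) loss; the loss is then scaled
        # by unroll_steps before backward, i.e. gradients correspond to the
        # loss summed over the unrolled window rather than the per-step mean.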
        loss *= args.unroll_steps
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        total_time += (time.time() - batch_start_time)
        if step % args.log_freq == 0:
            print("step %d/%d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  (step, n_steps_total, loss.numpy()[0], ppl.numpy()[0],
                   total_time / args.log_freq))
            total_time = 0.0
        if rank == 0 and step % args.save_freq == 0:
            save_params(elmo, optimizer, args.save_dir, step)
        if step == n_steps_total:
            # training done
            if rank == 0:
                save_params(elmo, optimizer, args.save_dir, 'final')
            break
        batch_start_time = time.time()
Example #13
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset = dataset_class.get_datasets(["train"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
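    # Tuple(...) yields (input_ids, segment_ids, length, label); the length
    # field at index 2 is dropped before the batch is returned.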
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64"
              if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    num_classes = 1 if train_dataset.get_labels() is None else len(
        train_dataset.get_labels())
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (int(
        math.floor(num_training_steps * args.warmup_proportion)))
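    # LambdaDecay multiplier: linear warmup from 0 to 1 over warmup_steps,
    # then linear decay back to 0 over the remaining training steps.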
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=warmup_steps, num_training_steps=
        num_training_steps: float(current_step) / float(
            max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
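
# A minimal standalone sketch (not part of the original script) of the
# warmup/decay multiplier that the LambdaDecay above applies to the base
# learning rate: linear warmup from 0 to 1, then linear decay back to 0.
def lr_lambda_sketch(current_step, num_warmup_steps, num_training_steps):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - current_step) /
               float(max(1, num_training_steps - num_warmup_steps)))

# For example, with 100 warmup steps out of 1000 total:
#   lr_lambda_sketch(50, 100, 1000)  -> 0.5  (halfway through warmup)
#   lr_lambda_sketch(550, 100, 1000) -> 0.5  (halfway through the decay phase)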
Example #14
def run(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)
    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses an ArrowTable as the basic data structure, while we use a list of dictionaries instead.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Let's label those examples!
        for i, tokenized_example in enumerate(tokenized_examples):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_example["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # The offset mappings will give us a map from token to character position in the original context. This will
            # help us compute the start_positions and end_positions.
            offsets = tokenized_example['offset_mapping']

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            answers = examples[sample_index]['answers']
            answer_starts = examples[sample_index]['answer_starts']

            # If no answers are given, set the cls_index as answer.
            if len(answer_starts) == 0:
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
            else:
                # Start/end character index of the answer in the text.
                start_char = answer_starts[0]
                end_char = start_char + len(answers[0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                # Minus one more to reach actual text
                token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples[i]["start_positions"] = cls_index
                    tokenized_examples[i]["end_positions"] = cls_index
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples[i][
                        "start_positions"] = token_start_index - 1
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples[i][
                        "end_positions"] = token_end_index + 1

        return tokenized_examples

    if args.do_train:
        if args.train_file:
            train_ds = load_dataset('squad', data_files=args.train_file)
        elif args.version_2_with_negative:
            train_ds = load_dataset('squad', splits='train_v2')
        else:
            train_ds = load_dataset('squad', splits='train_v1')
        train_ds.map(prepare_train_features, batched=True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
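        # Dict(...) picks the listed keys out of every tokenized feature and
        # applies the paired Pad/Stack op, producing batched input_ids,
        # token_type_ids, start_positions and end_positions tensors.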
        train_batchify_fn = lambda samples, fn=Dict(
            {
                "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
                "token_type_ids": Pad(axis=0,
                                      pad_val=tokenizer.pad_token_type_id),
                "start_positions": Stack(dtype="int64"),
                "end_positions": Stack(dtype="int64")
            }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, start_positions, end_positions = batch

                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_validation_features function. The main
        # difference is that HuggingFace uses an ArrowTable as the basic data structure, while we use a list of
        # dictionaries instead.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # For validation, there is no need to compute start and end positions
        for i, tokenized_example in enumerate(tokenized_examples):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            tokenized_examples[i]["example_id"] = examples[sample_index]['id']

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        return tokenized_examples

    if args.do_predict:
        if args.predict_file:
            dev_ds = load_dataset('squad', data_files=args.predict_file)
        elif args.version_2_with_negative:
            dev_ds = load_dataset('squad', splits='dev_v2')
        else:
            dev_ds = load_dataset('squad', splits='dev_v1')

        dev_ds.map(prepare_validation_features, batched=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids":
            Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids":
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            evaluate(model, dev_data_loader, args)
Example #15
    def __init__(self, config, mode="train"):
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        if "Head" in self.config["Arch"]:
            self.is_rec = True
        else:
            self.is_rec = False

        # set seed
        seed = self.config["Global"].get("seed", False)
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be an integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(name='root', log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        assert self.eval_mode in ["classification", "retrieval"], logger.error(
            "Invalid eval mode: {}".format(self.eval_mode))
        self.train_epoch_func = train_epoch
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl
        self.vdl_writer = None
        if self.config['Global']['use_visualdl'] and mode == "train":
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training
        self.amp = True if "AMP" in self.config else False
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
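            # Framework flags that are typically enabled together with AMP for
            # better mixed-precision throughput.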
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
                'FLAGS_max_inplace_grad_add': 8,
            }
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})
        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(self.config["DataLoader"],
                                                     "Train", self.device,
                                                     self.use_dali)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                self.gallery_query_dataloader = None
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)
                else:
                    self.eval_loss_func = None
            else:
                self.eval_loss_func = None

        # build metric
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    self.train_metric_func = build_metrics(metric_config)
                else:
                    self.train_metric_func = None
        else:
            self.train_metric_func = None

        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)
        else:
            self.eval_metric_func = None

        # build model
        self.model = build_model(self.config["Arch"])
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # for slim
        self.pruner = get_pruner(self.config, self.model)
        self.quanter = get_quaner(self.config, self.model)

        # load_pretrain
        if self.config["Global"]["pretrained_model"] is not None:
            if self.config["Global"]["pretrained_model"].startswith("http"):
                load_dygraph_pretrain_from_url(
                    self.model, self.config["Global"]["pretrained_model"])
            else:
                load_dygraph_pretrain(
                    self.model, self.config["Global"]["pretrained_model"])

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for distributed
        self.config["Global"][
            "distributed"] = paddle.distributed.get_world_size() != 1
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])
Example #16
def do_train():
    set_seed(args)
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))
    model = paddle.DataParallel(model)

    print("============start train==========")
    train_ds = DuEventExtraction(args.train_data, args.vocab_path,
                                 args.tag_path)
    dev_ds = DuEventExtraction(args.dev_data, args.vocab_path, args.tag_path)
    test_ds = DuEventExtraction(args.test_data, args.vocab_path, args.tag_path)

    trans_func = partial(convert_example_to_feature,
                         tokenizer=tokenizer,
                         label_vocab=train_ds.label_vocab,
                         max_seq_len=args.max_seq_len,
                         no_entity_label=no_entity_label,
                         ignore_label=ignore_label,
                         is_test=False)
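    # convert_example_to_feature is applied inside the collate fn here: raw
    # samples are mapped to features first, then padded/stacked by Tuple(...).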
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]
            ),  # token type ids
        Stack(),  # sequence lens
        Pad(axis=0, pad_val=ignore_label)  # labels
    ): fn(list(map(trans_func, samples)))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=batch_sampler,
                                        collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_size=args.batch_size,
                                      collate_fn=batchify_fn)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=args.batch_size,
                                       collate_fn=batchify_fn)

    num_training_steps = len(train_loader) * args.num_epoch
    optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate,
                                       parameters=model.parameters())

    metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(),
                            suffix=True)
    criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    step, best_f1 = 0, 0.0
    model.train()
    for epoch in range(args.num_epoch):
        for idx, (input_ids, token_type_ids, seq_lens,
                  labels) in enumerate(train_loader):
            logits = model(input_ids,
                           token_type_ids).reshape([-1, train_ds.label_num])
            loss = paddle.mean(criterion(logits, labels.reshape([-1])))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            if step > 0 and step % args.skip_step == 0 and paddle.distributed.get_rank(
            ) == 0:
                print(
                    f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) - loss: {loss_item:.6f}'
                )
            if step > 0 and step % args.valid_step == 0 and paddle.distributed.get_rank(
            ) == 0:
                p, r, f1, avg_loss = evaluate(model, criterion, metric,
                                              len(label_map), dev_loader)
                print(f'dev step: {step} - loss: {avg_loss:.5f}, precision: {p:.5f}, recall: {r:.5f}, ' \
                        f'f1: {f1:.5f} current best {best_f1:.5f}')
                if f1 > best_f1:
                    best_f1 = f1
                    print(f'==============================================save best model ' \
                            f'best performance {best_f1:.5f}')
                    paddle.save(model.state_dict(),
                                '{}/best.pdparams'.format(args.checkpoints))
            step += 1

    # save the final model
    if paddle.distributed.get_rank() == 0:
        paddle.save(model.state_dict(),
                    '{}/final.pdparams'.format(args.checkpoints))
Example #17
def run(args):
    paddle.set_device(args.device)
    set_seed(args)

    max_seq_length = args.max_seq_length
    max_num_choices = 10

    def preprocess_function(examples, do_predict=False):
        SPIECE_UNDERLINE = '▁'

        def _is_chinese_char(cp):
            if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                (cp >= 0x3400 and cp <= 0x4DBF) or  #
                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or  #
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
                return True

            return False

        def is_fuhao(c):
            if c == '。' or c == ',' or c == '!' or c == '?' or c == ';' or c == '、' or c == ':' or c == '(' or c == ')' \
                    or c == '-' or c == '~' or c == '「' or c == '《' or c == '》' or c == ',' or c == '」' or c == '"' or c == '“' or c == '”' \
                    or c == '$' or c == '『' or c == '』' or c == '—' or c == ';' or c == '。' or c == '(' or c == ')' or c == '-' or c == '~' or c == '。' \
                    or c == '‘' or c == '’':
                return True
            return False

        def _tokenize_chinese_chars(text):
            """Adds whitespace around any CJK character."""
            output = []
            is_blank = False
            for index, char in enumerate(text):
                cp = ord(char)
                if is_blank:
                    output.append(char)
                    if context[index - 12:index + 1].startswith("#idiom"):
                        is_blank = False
                        output.append(SPIECE_UNDERLINE)
                else:
                    if text[index:index + 6] == "#idiom":
                        is_blank = True
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                    elif _is_chinese_char(cp) or is_fuhao(char):
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                        output.append(SPIECE_UNDERLINE)
                    else:
                        output.append(char)
            return "".join(output)

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F or c == SPIECE_UNDERLINE:
                return True
            return False

        def add_tokens_for_around(tokens, pos, num_tokens):
            num_l = num_tokens // 2
            num_r = num_tokens - num_l

            if pos >= num_l and (len(tokens) - 1 - pos) >= num_r:
                tokens_l = tokens[pos - num_l:pos]
                tokens_r = tokens[pos + 1:pos + 1 + num_r]
            elif pos <= num_l:
                tokens_l = tokens[:pos]
                right_len = num_tokens - len(tokens_l)
                tokens_r = tokens[pos + 1:pos + 1 + right_len]
            elif (len(tokens) - 1 - pos) <= num_r:
                tokens_r = tokens[pos + 1:]
                left_len = num_tokens - len(tokens_r)
                tokens_l = tokens[pos - left_len:pos]
            else:
                raise ValueError('impossible')

            return tokens_l, tokens_r

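        # Token budget: 3 positions are reserved for special tokens, and 5 more
        # roughly cover the 4-character idiom option and the '[unused1]'
        # placeholder that are added to every choice below.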
        max_tokens_for_doc = max_seq_length - 3
        num_tokens = max_tokens_for_doc - 5
        num_examples = len(examples.data["candidates"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            candidate = 0
            options = examples.data['candidates'][idx]

            # Each content may have several sentences.
            for context in examples.data['content'][idx]:
                context = context.replace("“", "\"").replace("”", "\"").replace("——", "--"). \
                    replace("—", "-").replace("―", "-").replace("…", "...").replace("‘", "\'").replace("’", "\'")
                context = _tokenize_chinese_chars(context)
                paragraph_text = context.strip()
                doc_tokens = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                all_doc_tokens = []
                for (i, token) in enumerate(doc_tokens):
                    if '#idiom' in token:
                        sub_tokens = [str(token)]
                    else:
                        sub_tokens = tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        all_doc_tokens.append(sub_token)
                tags = [blank for blank in doc_tokens if '#idiom' in blank]

                # Each sentence may have several tags
                for tag_index, tag in enumerate(tags):
                    pos = all_doc_tokens.index(tag)

                    tmp_l, tmp_r = add_tokens_for_around(all_doc_tokens, pos,
                                                         num_tokens)
                    num_l = len(tmp_l)
                    num_r = len(tmp_r)
                    tokens_l = []
                    for token in tmp_l:
                        if '#idiom' in token and token != tag:
                            # Mask tag which is not considered in this new sample.
                            # Each idiom has four words, so 4 mask tokens are used.
                            tokens_l.extend(['[MASK]'] * 4)
                        else:
                            tokens_l.append(token)
                    tokens_l = tokens_l[-num_l:]
                    del tmp_l

                    tokens_r = []
                    for token in tmp_r:
                        if '#idiom' in token and token != tag:
                            tokens_r.extend(['[MASK]'] * 4)
                        else:
                            tokens_r.append(token)
                    tokens_r = tokens_r[:num_r]
                    del tmp_r

                    tokens_list = []
                    # Each tag has ten choices, and the shape of each new
                    # example is [num_choices, seq_len]
                    for i, elem in enumerate(options):
                        option = tokenizer.tokenize(elem)
                        tokens = option + ['[SEP]'] + tokens_l + ['[unused1]'
                                                                  ] + tokens_r
                        tokens_list.append(tokens)
                    new_data = tokenizer(tokens_list, is_split_into_words=True)
                    # Final shape of input_ids: [batch_size, num_choices, seq_len]
                    result["input_ids"].append(new_data["input_ids"])
                    result["token_type_ids"].append(new_data["token_type_ids"])
                    if not do_predict:
                        label = examples.data["answers"][idx]["candidate_id"][
                            candidate]
                        result["labels"].append(label)
                    candidate += 1
            if (idx + 1) % 10000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "chid", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(partial(preprocess_function),
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
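        # Each preprocessed example is [num_choices, seq_len], so padding is
        # applied along axis=1 (the sequence dimension) rather than axis=0.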
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64")  # label
        }): fn(samples)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

        dev_ds = dev_ds.map(partial(preprocess_function),
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)

        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)

        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)

        loss_fct = nn.CrossEntropyLoss()
        metric = Accuracy()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, labels = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, labels)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
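                # Only step the optimizer every gradient_accumulation_steps
                # micro-batches; gradients from the intermediate backward
                # calls accumulate until then.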
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch, step + 1,
                               loss,
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
            tic_eval = time.time()
            acc = evaluate(model, loss_fct, metric, dev_data_loader)
            print("eval acc: %.5f, eval done total : %s s" %
                  (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                if not os.path.exists(args.output_dir):
                    os.makedirs(args.output_dir)
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)
        print("best_acc: ", best_acc)

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(
            preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)

        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)

        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)

        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        result = {}
        idx = 623377
        for step, batch in enumerate(test_data_loader):
            input_ids, segment_ids = batch
            with paddle.no_grad():
                logits = model(input_ids, segment_ids)
            preds = paddle.argmax(logits, axis=1).numpy().tolist()
            for pred in preds:
                result["#idiom" + str(idx)] = pred
                idx += 1

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(
                os.path.join(args.output_dir, 'chid11_predict.json'),
                "w",
                encoding='utf-8') as writer:
            writer.write(
                json.dumps(
                    result, ensure_ascii=False, indent=4) + "\n")
Example #18
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    if args.language == 'ch':
        train_ds, dev_ds = load_dataset("lcqmc", splits=["train", "dev"])

        if args.base_model == 'roberta_base':
            tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
            pretrained_model = RobertaForSequenceClassification.from_pretrained(
                'roberta-wwm-ext', num_classes=2)
        elif args.base_model == 'roberta_large':
            tokenizer = RobertaTokenizer.from_pretrained(
                'roberta-wwm-ext-large')
            pretrained_model = RobertaForSequenceClassification.from_pretrained(
                'roberta-wwm-ext-large', num_classes=2)
    else:
        train_ds, dev_ds = load_dataset("glue", "qqp", splits=["train", "dev"])

        if args.base_model == 'roberta_base':
            tokenizer = RobertaBPETokenizer.from_pretrained('roberta-base')
            pretrained_model = RobertaForSequenceClassification.from_pretrained(
                'roberta-base', num_classes=2)
        elif args.base_model == 'roberta_large':
            tokenizer = RobertaBPETokenizer.from_pretrained('roberta-large')
            pretrained_model = RobertaForSequenceClassification.from_pretrained(
                'roberta-large', num_classes=2)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        language=args.language)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    model = pretrained_model

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
            loss = criterion(probs, labels)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 100 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       100 / (time.time() - tic_train)),
                    flush=True)
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.eval_step == 0 and rank == 0:
                evaluate(model, criterion, metric, dev_data_loader)

            if global_step % args.save_step == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)
Example #19
def do_train(args):
    paddle.set_device(args.device)
    set_seed(args.seed)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Load model and train from scratch
    # model = RobertaForMaskedLM(
    #     RobertaModel(**RobertaForMaskedLM.pretrained_init_configuration[
    #         args.model_name_or_path]))
    model = RobertaForMaskedLM(RobertaModel(**roberta_arch))
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)
    ignore_label = IGNORE
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    # Load wikipedia dataset via Hugging face datasets
    # TO DO: paddle datasets
    import datasets
    tokenized_datasets = datasets.load_from_disk(args.input_file)
    train_ds = tokenized_datasets["train"]
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')

    # Prepare data for training
    collator_func = DataCollatorMLM(tokenizer=tokenizer) # data collator
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_data_loader = DataLoader(
        dataset=train_ds,
        collate_fn=collator_func,
        num_workers=0,
        batch_sampler=train_batch_sampler,
        return_list=True)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    num_training_steps = args.max_steps if args.max_steps > 0 else len(train_data_loader) * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    if args.amp: #mixed precision (fp16)
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    # Start training
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            input_ids, _, labels = batch
            with paddle.amp.auto_cast(
                        args.amp,
                        #custom_white_list=["layer_norm", "softmax", "gelu"]
                        ):
                logits = model(input_ids=input_ids)
                loss = loss_fct(logits, labels)
            if args.amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
                # print(args.amp, args.learning_rate, args.weight_decay)
            else:
                loss.backward()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()

            global_step += 1
            if global_step % args.logging_steps == 0:

                print(
                    "global step %d/%d, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, loss, optimizer.get_lr(),
                        args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                                "paddle_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
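Example #19 above wraps the forward pass in paddle.amp.auto_cast and scales the loss with paddle.amp.GradScaler. A minimal sketch of that AMP loop on a toy linear model, assuming a GPU with fp16 support is available (the model, shapes and loss here are illustrative only):

import paddle

model = paddle.nn.Linear(8, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=2.0 ** 15)

x = paddle.randn([4, 8])
y = paddle.randint(0, 2, [4])

with paddle.amp.auto_cast(True):          # run the forward pass in mixed precision
    logits = model(x)
    loss = paddle.nn.functional.cross_entropy(logits, y)

scaled = scaler.scale(loss)               # scale the loss to avoid fp16 underflow
scaled.backward()
scaler.minimize(optimizer, scaled)        # unscale grads, apply the step, update the loss scale
optimizer.clear_grad()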
Example #20
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
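Several examples in this collection build a decay_params list and pass apply_decay_param_fun to AdamW so that bias and LayerNorm parameters receive no weight decay. A minimal sketch of that filter on a tiny stand-in network (TinyNet is hypothetical, not the model used above):

import paddle

class TinyNet(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(4, 4)
        self.norm = paddle.nn.LayerNorm(4)

    def forward(self, x):
        return self.norm(self.linear(x))

net = TinyNet()
# Keep only parameters whose structured name contains neither "bias" nor "norm".
decay_params = [
    p.name for n, p in net.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
print(decay_params)  # only the Linear weight's parameter name survives the filter

optimizer = paddle.optimizer.AdamW(
    learning_rate=5e-5,
    parameters=net.parameters(),
    weight_decay=0.01,
    # weight decay is applied only to parameters whose names are in decay_params
    apply_decay_param_fun=lambda name: name in decay_params)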
Example #21
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    if args.version_2_with_negative:
        train_examples = load_dataset('squad_v2', split='train')
        dev_examples = load_dataset('squad_v2', split='validation')
    else:
        train_examples = load_dataset('squad', split='train')
        dev_examples = load_dataset('squad', split='validation')

    column_names = train_examples.column_names
    set_seed(args)
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("Loads checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_ds = train_examples.map(partial(prepare_train_features,
                                              tokenizer=tokenizer,
                                              args=args),
                                      batched=True,
                                      remove_columns=column_names,
                                      num_proc=4)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict(
            {
                "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
                "token_type_ids": Pad(axis=0,
                                      pad_val=tokenizer.pad_token_type_id),
                "start_positions": Stack(dtype="int64"),
                "end_positions": Stack(dtype="int64")
            }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()

        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, _, start_positions, end_positions = batch
                logits = model(input_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if rank == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    if args.do_predict and rank == 0:
        dev_ds = dev_examples.map(partial(prepare_validation_features,
                                          tokenizer=tokenizer,
                                          args=args),
                                  batched=True,
                                  remove_columns=column_names,
                                  num_proc=4)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids":
            Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids":
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        evaluate(model, dev_data_loader, args)
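The train_batchify_fn above collates features with paddlenlp.data's Dict, Pad and Stack. A minimal sketch with two hypothetical feature dicts (the real features come from prepare_train_features) showing how the collator turns a list of samples into padded batch arrays:

from paddlenlp.data import Dict, Pad, Stack

samples = [
    {"input_ids": [1, 5, 7], "token_type_ids": [0, 0, 0],
     "start_positions": 1, "end_positions": 2},
    {"input_ids": [1, 9], "token_type_ids": [0, 0],
     "start_positions": 0, "end_positions": 1},
]
batchify_fn = lambda samples, fn=Dict({
    "input_ids": Pad(axis=0, pad_val=0),      # pad to the longest sequence in the batch
    "token_type_ids": Pad(axis=0, pad_val=0),
    "start_positions": Stack(dtype="int64"),  # stack scalars into one int64 array
    "end_positions": Stack(dtype="int64"),
}): fn(samples)

input_ids, token_type_ids, starts, ends = batchify_fn(samples)
# input_ids has shape (2, 3); starts and ends have shape (2,)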
Example #22
def train(env):
    """Train"""
    args = env.args

    logging.info("loading ddparser_data.")
    train = Corpus.load(args.train_data_path, env.fields)
    dev = Corpus.load(args.valid_data_path, env.fields)
    test = Corpus.load(args.test_data_path, env.fields)
    logging.info("init dataset.")
    train = TextDataset(train, env.fields, args.buckets)
    dev = TextDataset(dev, env.fields, args.buckets)
    test = TextDataset(test, env.fields, args.buckets)
    logging.info("set the ddparser_data loaders.")
    train.loader = batchify(train, args.batch_size, args.use_data_parallel,
                            True)
    dev.loader = batchify(dev, args.batch_size)
    test.loader = batchify(test, args.batch_size)

    logging.info("{:6} {:5} sentences, ".format('train:', len(train)) +
                 "{:3} batches, ".format(len(train.loader)) +
                 "{} buckets".format(len(train.buckets)))
    logging.info("{:6} {:5} sentences, ".format('dev:', len(dev)) +
                 "{:3} batches, ".format(len(dev.loader)) +
                 "{} buckets".format(len(dev.buckets)))
    logging.info("{:6} {:5} sentences, ".format('test:', len(test)) +
                 "{:3} batches, ".format(len(test.loader)) +
                 "{} buckets".format(len(test.buckets)))

    logging.info("Create the model")
    model = Model(args)

    # init parallel strategy
    if args.use_data_parallel:
        dist.init_parallel_env()
        model = paddle.DataParallel(model)

    if args.encoding_model.startswith(
            "ernie"
    ) and args.encoding_model != "ernie-lstm" or args.encoding_model == 'transformer':
        args['lr'] = args.ernie_lr
    else:
        args['lr'] = args.lstm_lr

    if args.encoding_model.startswith(
            "ernie") and args.encoding_model != "ernie-lstm":
        max_steps = 100 * len(train.loader)
        decay = LinearDecay(args.lr, int(args.warmup_proportion * max_steps),
                            max_steps)
    else:
        decay = dygraph.ExponentialDecay(learning_rate=args.lr,
                                         decay_steps=args.decay_steps,
                                         decay_rate=args.decay)

    grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.clip)

    if args.encoding_model.startswith(
            "ernie") and args.encoding_model != "ernie-lstm":
        optimizer = AdamW(
            learning_rate=decay,
            parameter_list=model.parameters(),
            weight_decay=args.weight_decay,
            grad_clip=grad_clip,
        )
    else:
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=decay,
            beta1=args.mu,
            beta2=args.nu,
            epsilon=args.epsilon,
            parameter_list=model.parameters(),
            grad_clip=grad_clip,
        )

    total_time = datetime.timedelta()
    best_e, best_metric = 1, Metric()

    puncts = dygraph.to_variable(env.puncts, zero_copy=False)
    logging.info("start training.")

    for epoch in range(1, args.epochs + 1):
        start = datetime.datetime.now()
        # train one epoch and update the parameter
        logging.info("Epoch {} / {}:".format(epoch, args.epochs))
        epoch_train(args, model, optimizer, train.loader, epoch)
        if args.local_rank == 0:
            loss, dev_metric = epoch_evaluate(args, model, dev.loader, puncts)
            logging.info("{:6} Loss: {:.4f} {}".format('dev:', loss,
                                                       dev_metric))
            loss, test_metric = epoch_evaluate(args, model, test.loader,
                                               puncts)
            logging.info("{:6} Loss: {:.4f} {}".format('test:', loss,
                                                       test_metric))

            t = datetime.datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience // 10:
                best_e, best_metric = epoch, dev_metric
                save(args.model_path, args, model, optimizer)
                logging.info("{}s elapsed (saved)\n".format(t))
            else:
                logging.info("{}s elapsed\n".format(t))
            total_time += t
            if epoch - best_e >= args.patience:
                break
    if args.local_rank == 0:
        model = load(args.model_path, model)
        loss, metric = epoch_evaluate(args, model, test.loader, puncts)
        logging.info("max score of dev is {:.2%} at epoch {}".format(
            best_metric.score, best_e))
        logging.info("the score of test at epoch {} is {:.2%}".format(
            best_e, metric.score))
        logging.info("average time of each epoch is {}s".format(total_time /
                                                                epoch))
        logging.info("{}s elapsed".format(total_time))
Example #23
    def train(self, validate=False):
        assert self.mode == 'train', "Model not in 'train' mode"
        Init_mark = False

        # if validation in training is enabled, metrics should be re-init
        if validate:
            self._init_metrics(validate=validate)
            self._reset_metrics()

        model = self.model
        if self.cfg.get('fleet', False):
            model = fleet.distributed_model(model)
            self.optimizer = fleet.distributed_optimizer(self.optimizer)
        elif self._nranks > 1:
            find_unused_parameters = self.cfg[
                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
            model = paddle.DataParallel(
                self.model, find_unused_parameters=find_unused_parameters)

        # initial fp16
        if self.cfg.get('fp16', False):
            scaler = amp.GradScaler(
                enable=self.cfg.use_gpu, init_loss_scaling=1024)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        if self.cfg.get('print_flops', False):
            self._flops(self.loader)

        self._compose_callback.on_train_begin(self.status)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['mode'] = 'train'
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            model.train()
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

                if self.cfg.get('fp16', False):
                    with amp.auto_cast(enable=self.cfg.use_gpu):
                        # model forward
                        outputs = model(data)
                        loss = outputs['loss']

                    # model backward
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    # in dygraph mode, optimizer.minimize is equal to optimizer.step
                    scaler.minimize(self.optimizer, scaled_loss)
                else:
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']
                    # model backward
                    loss.backward()
                    self.optimizer.step()

                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if self._nranks < 2 or self._local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                if self.use_ema:
                    self.ema.update(self.model)
                iter_tic = time.time()

            # apply ema weight on model
            if self.use_ema:
                weight = copy.deepcopy(self.model.state_dict())
                self.model.set_dict(self.ema.apply())

            self._compose_callback.on_epoch_end(self.status)

            if validate and (self._nranks < 2 or self._local_rank == 0) \
                    and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                # if validation in training is enabled, metrics should be re-init
                # Init_mark makes sure this code will only execute once
                if validate and Init_mark == False:
                    Init_mark = True
                    self._init_metrics(validate=validate)
                    self._reset_metrics()
                with paddle.no_grad():
                    self.status['save_best_model'] = True
                    self._eval_with_loader(self._eval_loader)

            # restore origin weight on model
            if self.use_ema:
                self.model.set_dict(weight)

        self._compose_callback.on_train_end(self.status)
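The trainer above swaps EMA weights in before evaluation and restores the raw weights afterwards. A minimal sketch of that swap with a hypothetical SimpleEMA holder standing in for the trainer's self.ema (the decay value and toy model are illustrative):

import copy
import paddle

class SimpleEMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {k: v.clone() for k, v in model.state_dict().items()}

    def update(self, model):
        for k, v in model.state_dict().items():
            self.shadow[k] = self.decay * self.shadow[k] + (1.0 - self.decay) * v

    def apply(self):
        return self.shadow

model = paddle.nn.Linear(4, 4)
ema = SimpleEMA(model)
ema.update(model)                           # called once per training step

weight = copy.deepcopy(model.state_dict())  # back up the raw weights
model.set_dict(ema.apply())                 # evaluate with the EMA weights
# ... run evaluation here ...
model.set_dict(weight)                      # restore the raw weights before training resumes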
Example #24
def train(args):
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args)

    label2id_map, id2label_map = get_label_maps()
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    # dist mode
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForRelationExtraction(base_model, dropout=None)

    # dist mode
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_dataset = XFUN(tokenizer,
                         data_dir=args.train_data_dir,
                         label_path=args.train_label_path,
                         label2id_map=label2id_map,
                         img_size=(224, 224),
                         max_seq_len=args.max_seq_length,
                         pad_token_label_id=pad_token_label_id,
                         contains_re=True,
                         add_special_ids=False,
                         return_attention_mask=True,
                         load_mode='all')

    eval_dataset = XFUN(tokenizer,
                        data_dir=args.eval_data_dir,
                        label_path=args.eval_label_path,
                        label2id_map=label2id_map,
                        img_size=(224, 224),
                        max_seq_len=args.max_seq_length,
                        pad_token_label_id=pad_token_label_id,
                        contains_re=True,
                        add_special_ids=False,
                        return_attention_mask=True,
                        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
    args.train_batch_size = args.per_gpu_train_batch_size * max(
        1, paddle.distributed.get_world_size())
    train_dataloader = paddle.io.DataLoader(train_dataset,
                                            batch_sampler=train_sampler,
                                            num_workers=8,
                                            use_shared_memory=True,
                                            collate_fn=DataCollator())

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=8,
        shuffle=False,
        collate_fn=DataCollator())

    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate,
        )
    grad_clip = paddle.nn.ClipGradByNorm(clip_norm=10)
    optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate,
                                      parameters=model.parameters(),
                                      epsilon=args.adam_epsilon,
                                      grad_clip=grad_clip,
                                      weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per GPU = {args.per_gpu_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * paddle.distributed.get_world_size()}"
    )
    logger.info(f"  Total optimization steps = {t_total}")

    global_step = 0
    train_dataloader_len = len(train_dataloader)
    best_metric = {'f1': 0}
    model.train()

    for epoch in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            # the model returns a dict of outputs; take the loss entry
            loss = outputs['loss']
            loss = loss.mean()

            logger.info(
                f"epoch: [{epoch}/{args.num_train_epochs}], iter: [{step}/{train_dataloader_len}], global_step:{global_step}, train loss: {np.mean(loss.numpy())}, lr: {optimizer.get_lr()}"
            )

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # lr_scheduler.step()  # Update learning rate schedule

            global_step += 1

            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0
                    and global_step % args.eval_steps == 0):
                # Log metrics
                if paddle.distributed.get_rank(
                ) == 0 and args.evaluate_during_training:
                    results = evaluate(model, eval_dataloader, logger)
                    if results['f1'] > best_metric['f1']:
                        best_metric = results
                        output_dir = os.path.join(args.output_dir,
                                                  "checkpoint-best")
                        os.makedirs(output_dir, exist_ok=True)
                        model.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            args, os.path.join(output_dir,
                                               "training_args.bin"))
                        logger.info(f"Saving model checkpoint to {output_dir}")
                    logger.info(f"eval results: {results}")
                    logger.info(f"best_metirc: {best_metirc}")

            if (paddle.distributed.get_rank() == 0 and args.save_steps > 0
                    and global_step % args.save_steps == 0):
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir, "checkpoint-latest")
                os.makedirs(output_dir, exist_ok=True)
                if paddle.distributed.get_rank() == 0:
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info(f"Saving model checkpoint to {output_dir}")
    logger.info(f"best_metirc: {best_metirc}")
Example #25
def main(args):
    """tbd"""
    compound_encoder_config = load_json_config(args.compound_encoder_config)
    model_config = load_json_config(args.model_config)
    if args.dropout_rate is not None:
        compound_encoder_config['dropout_rate'] = args.dropout_rate
        model_config['dropout_rate'] = args.dropout_rate

    compound_encoder = GeoGNNModel(compound_encoder_config)
    model = GeoPredModel(model_config, compound_encoder)
    if args.distributed:
        model = paddle.DataParallel(model)
    opt = paddle.optimizer.Adam(learning_rate=args.lr,
                                parameters=model.parameters())
    print('Total param num: %s' % (len(model.parameters())))
    for i, param in enumerate(model.named_parameters()):
        print(i, param[0], param[1].name)

    if args.init_model is not None and args.init_model != "":
        compound_encoder.set_state_dict(paddle.load(args.init_model))
        print('Load state_dict from %s' % args.init_model)

    # get dataset
    dataset = load_smiles_to_dataset(args.data_path)
    if args.DEBUG:
        dataset = dataset[100:180]
    dataset = dataset[dist.get_rank()::dist.get_world_size()]
    smiles_lens = [len(smiles) for smiles in dataset]
    print('Total size:%s' % (len(dataset)))
    print('Dataset smiles min/max/avg length: %s/%s/%s' %
          (np.min(smiles_lens), np.max(smiles_lens), np.mean(smiles_lens)))
    transform_fn = GeoPredTransformFn(model_config['pretrain_tasks'],
                                      model_config['mask_ratio'])
    # this step will be time consuming due to rdkit 3d calculation
    dataset.transform(transform_fn, num_workers=args.num_workers)
    test_index = int(len(dataset) * (1 - args.test_ratio))
    train_dataset = dataset[:test_index]
    test_dataset = dataset[test_index:]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    collate_fn = GeoPredCollateFn(
        atom_names=compound_encoder_config['atom_names'],
        bond_names=compound_encoder_config['bond_names'],
        bond_float_names=compound_encoder_config['bond_float_names'],
        bond_angle_float_names=compound_encoder_config[
            'bond_angle_float_names'],
        pretrain_tasks=model_config['pretrain_tasks'],
        mask_ratio=model_config['mask_ratio'],
        Cm_vocab=model_config['Cm_vocab'])
    train_data_gen = train_dataset.get_data_loader(
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True,
        collate_fn=collate_fn)

    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        s = time.time()
        train_loss = train(args, model, opt, train_data_gen)
        test_loss = evaluate(args, model, test_dataset, collate_fn)
        if not args.distributed or dist.get_rank() == 0:
            paddle.save(compound_encoder.state_dict(),
                        '%s/epoch%d.pdparams' % (args.model_dir, epoch_id))
            list_test_loss.append(test_loss['loss'])
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))
            print("Time used:%ss" % (time.time() - s))

    if not args.distributed or dist.get_rank() == 0:
        print('Best epoch id:%s' % np.argmin(list_test_loss))
Example #26
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(
        read_simcse_text, data_path=args.train_set_file, lazy=False)

    dev_ds = load_dataset(
        read_text_pair, data_path=args.test_set_file, lazy=False)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
       'ernie-1.0',
       hidden_dropout_prob=args.dropout,
       attention_probs_dropout_prob=args.dropout)

    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
    ): [data for data in fn(samples)]

    dev_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
        Stack(dtype="int64"),  # labels
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    dev_data_loader = create_dataloader(
        dev_ds,
        mode='eval',
        batch_size=args.batch_size,
        batchify_fn=dev_batchify_fn,
        trans_fn=trans_func)

    model = SimCSE(
        pretrained_model,
        margin=args.margin,
        scale=args.scale,
        output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from:{}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch

            loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.eval_steps == 0 and rank == 0:
                # need better way to get model Layers
                spearman_corr, total_num = do_evaluate(model._layers, tokenizer, dev_data_loader, args.infer_with_fc_pooler)
                print("global step: {}, spearman_corr: {:.4f}, total_num: {}".format(global_step, spearman_corr, total_num))

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

            if args.max_steps > 0 and global_step >= args.max_steps:
                return
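Example #26 checkpoints with paddle.save and warm-starts with paddle.load plus set_dict (the init_from_ckpt branch near the top). A minimal sketch of that round trip on a toy layer; the directory name is hypothetical:

import os
import paddle

model = paddle.nn.Linear(4, 4)
save_dir = "./checkpoint_demo"
os.makedirs(save_dir, exist_ok=True)
param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), param_path)      # write the parameters to disk

restored = paddle.nn.Linear(4, 4)
restored.set_dict(paddle.load(param_path))       # warm start from the saved parameters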
Example #27
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Reads label_map.
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not (os.path.exists(label_map_path) and os.path.isfile(label_map_path)):
        sys.exit("{} dose not exists or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)
    num_classes = (len(label_map.keys()) - 2) * 2 + 2

    # Loads pretrained model ERNIE
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=num_classes)
    model = paddle.DataParallel(model)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    criterion = BCELossForDuIE()

    # Loads dataset.
    train_dataset = DuIEDataset.from_file(
        os.path.join(args.data_path, 'train_data.json'), tokenizer,
        args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
    collator = DataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=collator,
        return_list=True)
    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                         args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(
        test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True)
    test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_sampler=test_batch_sampler,
        collate_fn=collator,
        return_list=True)

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_ratio)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
            logits = model(input_ids=input_ids)
            mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
                (input_ids != 2))
            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            global_step += 1

            if global_step % logging_steps == 0 and rank == 0:
                print(
                    "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                    % (epoch, args.num_train_epochs, step, steps_by_epoch,
                       loss_item, logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % save_steps == 0 and rank == 0:
                print("\n=====start evaluating ckpt of %d steps=====" %
                      global_step)
                precision, recall, f1 = evaluate(
                    model, criterion, test_data_loader, eval_file_path, "eval")
                print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                      (100 * precision, 100 * recall, 100 * f1))
                print("saving checkpoing model_%d.pdparams to %s " %
                      (global_step, args.output_dir))
                paddle.save(model.state_dict(),
                            os.path.join(args.output_dir,
                                         "model_%d.pdparams" % global_step))
                model.train()  # back to train mode

        tic_epoch = time.time() - tic_epoch
        print("epoch time footprint: %d hour %d min %d sec" %
              (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60))

    # Does final evaluation.
    if rank == 0:
        print("\n=====start evaluating last ckpt of %d steps=====" %
              global_step)
        precision, recall, f1 = evaluate(model, criterion, test_data_loader,
                                         eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
              (100 * precision, 100 * recall, 100 * f1))
        paddle.save(model.state_dict(),
                    os.path.join(args.output_dir,
                                 "model_%d.pdparams" % global_step))
        print("\n=====training complete=====")
Example #28
def main(args):
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1 and args.do_train:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(
        dataset_class.convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_len)
    test_trans_func = partial(
        dataset_class.convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    if args.task_name in ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda'):
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype='int64')  # label
        ): fn(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64')  # label
        ): fn(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)
    if world_size > 1 and args.do_train:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class, trans_func,
                                               batchify_fn, 'train')
        if args.do_eval:
            dev_data_loader = create_data_loader(
                args, dataset_class, test_trans_func, batchify_fn, 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric,
              world_size, rank)

    if args.do_test:
        if rank == 0:
            test_data_loader = create_data_loader(
                args, dataset_class, test_trans_func, batchify_fn, 'test')
            if args.do_train:
                # If do_eval=True, use best model to evaluate the test data.
                # Otherwise, use final model to evaluate the test data.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(args.output_dir, 'best')
                    load_ckpt(args, model)
            else:
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)
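Example #28 collates its (input, segment, label) triples with paddlenlp.data's Tuple, where the i-th collator handles the i-th field of every sample, in contrast to the key-based Dict collator shown earlier. A minimal sketch with made-up token ids:

from paddlenlp.data import Pad, Stack, Tuple

samples = [
    ([1, 11, 12, 2], [0, 0, 0, 0], 1),
    ([1, 21, 2], [0, 0, 0], 0),
]
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),   # input ids, padded to the longest sample
    Pad(axis=0, pad_val=0),   # segment ids, padded the same way
    Stack(dtype='int64'),     # labels, stacked into a single int64 array
): fn(samples)

input_ids, segment_ids, labels = batchify_fn(samples)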
Example #29
File: pretrain.py  Project: zs960114/ERNIE
    param_name_to_exclude_from_weight_decay = re.compile(
        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')

    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr, get_warmup_and_linear_decay(args.max_steps,
                                             args.warmup_steps))
    g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental

    opt = P.optimizer.AdamW(learning_rate=lr_scheduler,
                            parameters=model.parameters(),
                            apply_decay_param_fun=lambda n:
                            param_name_to_exclude_from_weight_decay.match(n),
                            weight_decay=args.wd,
                            grad_clip=g_clip)

    model = P.DataParallel(model)

    scaler = P.amp.GradScaler(enable=args.use_amp)
    create_if_not_exists(args.save_dir)
    with P.amp.auto_cast(args.use_amp):
        for step, samples in enumerate(
                P.io.DataLoader(train_ds,
                                places=P.CUDAPlace(env.dev_id),
                                batch_size=0)):
            (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
            loss, mlmloss, nsploss = model(src_ids,
                                           sent_ids,
                                           labels=mlm_label,
                                           mlm_pos=mask_pos,
                                           nsp_labels=nsp_label)
            loss = scaler.scale(loss)
Example #30
def run(args):
    # initialize the distributed env
    if args.is_parallel == 1:
        dist.init_parallel_env()

    data_path = args.data_path + args.dataset + '/'

    CVs = ['CV1', 'CV2', 'CV3', 'CV4', 'CV5']

    for CV in CVs:
        print('><<><><><><><><><><><><><><><><><><><><><><><><><<><><><><><>')
        print('start {}'.format(CV))

        ##################### load the data ############################
        train_file = CV + '_' + args.dataset + '_' + args.split + '_' + 'train' + '.csv'
        val_file = CV + '_' + args.dataset + '_' + args.split + '_' + 'val' + '.csv'
        test = 'test_' + args.dataset + '_' + args.split + '.csv'

        print('Load data...')
        r_train = pd.read_csv(data_path + CV + '/' + train_file)
        r_train = r_train.reset_index(drop=True)
        r_val = pd.read_csv(data_path + CV + '/' + val_file)
        r_val = r_val.reset_index(drop=True)
        r_test = pd.read_csv(data_path + test)
        r_test = r_test.reset_index(drop=True)

        if args.is_mixed:
            # load the mixed data
            if args.dataset == 'DAVIS':
                mixed_dataset = 'KIBA'
            if args.dataset == 'KIBA':
                mixed_dataset = 'DAVIS'

            # load the mixed data
            mixed_data_file = mixed_dataset + '_mixed_train_unseenP_seenD.csv'
            mixed_data = pd.read_csv(data_path + mixed_data_file)
            mixed_data = mixed_data.reset_index(drop=True)
            # remove the repeated protein sequence
            val_t = r_val['Target Sequence'].unique()
            mixed_t = mixed_data['Target Sequence'].unique()
            filter1 = list((set(val_t).intersection(set(mixed_t))))
            mixed_data = mixed_data[~mixed_data['Target Sequence'].isin(filter1
                                                                        )]
            mixed_data = mixed_data.reset_index(drop=True)

        r_train = load_customised_Davis(r_train)
        r_val = load_customised_Davis(r_val)
        r_test = load_customised_Davis(r_test)
        if args.is_mixed:
            r_mixed = load_customised_Davis(mixed_data)

        LEN_train = len(r_train)

        print('number of train samples are {}'.format(len(r_train)))
        print('number of validation samples are {}'.format(len(r_val)))
        print('number of test samples are {}'.format(len(r_test)))
        if args.is_mixed:
            print('number of mixed samples are {}'.format(len(r_mixed)))
        print('Load done.\n')

        if args.is_mixed:
            # concatenate the data
            r_train = np.concatenate((r_train, r_mixed))

        ###### get the protein group and index for train/val/test
        qid_doc_map_train = group_by(r_train, 0)
        query_idx_train = qid_doc_map_train.keys()
        train_keys = np.array(list(query_idx_train))

        if args.is_mixed:
            id_doc_map_mixed = group_by(r_mixed, 0)
            query_idx_mixed = id_doc_map_mixed.keys()
            mixed_keys = np.array(list(query_idx_mixed))

        qid_doc_map_val = group_by(r_val, 0)
        query_idx_val = qid_doc_map_val.keys()
        val_keys = np.array(list(query_idx_val))

        qid_doc_map_test = group_by(r_test, 0)
        query_idx_test = qid_doc_map_test.keys()
        test_keys = np.array(list(query_idx_test))
        ###### get the protein group and index for train/val/test

        # get the true scores of train
        true_scores = [
            r_train[qid_doc_map_train[qid], 1] for qid in query_idx_train
        ]
        if args.is_mixed:
            true_scores_mixed = [
                r_mixed[id_doc_map_mixed[qid], 1] for qid in query_idx_mixed
            ]

        ###### get val/test dataloader
        val_index = []
        for qid in val_keys:
            val_index.append(qid_doc_map_val[qid])
        val_dataset = Data_test(val_index, r_val)
        val_dataloader = paddle.io.DataLoader(val_dataset,
                                              batch_size=args.test_batch_size,
                                              shuffle=True)

        test_index = []
        for qid in test_keys:
            test_index.append(qid_doc_map_test[qid])
        test_dataset = Data_test(test_index, r_test)
        test_dataloader = paddle.io.DataLoader(test_dataset,
                                               batch_size=args.test_batch_size,
                                               shuffle=True)

        # Load model
        model_config = json.load(open(args.model_config_path, 'r'))
        model = MolTransModel(model_config)
        len_SMILES = model_config['drug_max_seq']
        len_target = model_config['target_max_seq']

        if args.is_parallel == 1:
            model_parallel = paddle.DataParallel(model)
        else:
            model_parallel = model

        # define the optimizer
        optimizer = paddle.optimizer.AdamW(
            parameters=model_parallel.parameters(),
            weight_decay=0.01,
            learning_rate=args.learning_rate)

        print('start to train the model...')
        for epoch in range(args.N_epoch):
            ##################### resampling the pairs for each epoch #####################
            train_x1_index, train_x2_index, train_scores, Y_train = sample_pairs(
                true_scores,
                K=args.sampling_N_train,
                eps=args.filter_threshold,
                seed=epoch)
            if args.is_mixed:
                mixed_x1_index, mixed_x2_index, mixed_scores, Y_mixed = sample_pairs(
                    true_scores_mixed,
                    K=args.sampling_N_mixed,
                    eps=args.filter_threshold,
                    seed=epoch)
                # mixed all pairs from train and mixed dataset
                temp = LEN_train
                mixed_x1_index = [i + temp for i in mixed_x1_index]
                mixed_x2_index = [i + temp for i in mixed_x2_index]

                train_x1_index = train_x1_index + mixed_x1_index
                train_x2_index = train_x2_index + mixed_x2_index

                Y_train = np.concatenate((Y_train, Y_mixed))

            train_dataset = Data_Encoder_flow(train_x1_index, train_x2_index,
                                              Y_train, r_train)

            if args.is_parallel:
                train_batch_sampler = paddle.io.DistributedBatchSampler(
                    train_dataset,
                    batch_size=args.train_batch_size,
                    shuffle=True)
                train_dataloader = paddle.io.DataLoader(
                    train_dataset, batch_sampler=train_batch_sampler)
            else:
                train_dataloader = paddle.io.DataLoader(
                    train_dataset,
                    batch_size=args.train_batch_size,
                    shuffle=True,
                    num_workers=23)
            ##################### resampling the pairs for each epoch #####################

            print('***************train')
            LOSS = []
            model.train()
            model_parallel.train()
            start_time = time.time()
            for batch_id, data in enumerate(train_dataloader()):
                batch_x1 = data[0]
                batch_x2 = data[1]
                batch_y = data[2]

                ###### define loss and optimization function
                loss_ = nn.BCEWithLogitsLoss()

                # split to smiles and protein
                batch_x1_smiles = batch_x1[:, 0:len_SMILES].astype('int64')
                batch_x1_protein = batch_x1[:, len_SMILES:len_SMILES +
                                            len_target].astype('int64')
                batch_x1_smiles_mask = batch_x1[:, len_SMILES +
                                                len_target:len_SMILES +
                                                len_target +
                                                len_SMILES].astype('int64')
                batch_x1_protein_mask = batch_x1[:, len_SMILES + len_target +
                                                 len_SMILES:].astype('int64')

                batch_x2_smiles = batch_x2[:, 0:len_SMILES].astype('int64')
                batch_x2_protein = batch_x2[:, len_SMILES:len_SMILES +
                                            len_target].astype('int64')
                batch_x2_smiles_mask = batch_x2[:, len_SMILES +
                                                len_target:len_SMILES +
                                                len_target +
                                                len_SMILES].astype('int64')
                batch_x2_protein_mask = batch_x2[:, len_SMILES + len_target +
                                                 len_SMILES:].astype('int64')

                optimizer.clear_grad()
                res = model_parallel(batch_x1_smiles, batch_x1_protein,
                                     batch_x2_smiles, batch_x2_protein,
                                     batch_x1_smiles_mask,
                                     batch_x1_protein_mask,
                                     batch_x2_smiles_mask,
                                     batch_x2_protein_mask)

                loss = loss_(res.squeeze(1), batch_y)

                loss.backward()
                optimizer.step()
                # scheduler.step()

                if batch_id % 100 == 0:
                    print('batch {} loss {}'.format(batch_id, loss.numpy()))

                LOSS.append(loss.numpy())

            end_time = time.time()
            print('take time {}'.format(end_time - start_time))
            print('epoch {}: loss: {} '.format(epoch, np.mean(LOSS)))

            # validation
            print('***************validation')
            val_average_CI, val_weighted_CI, val_overall_CI = model_eval(
                model, val_dataloader, len_SMILES, len_target)
            # test
            print('***************test')
            test_average_CI, test_weighted_CI, test_overall_CI = model_eval(
                model, test_dataloader, len_SMILES, len_target)

            if epoch == 0:
                best_average_CI = val_average_CI
                # save the best epoch
                paddle.save(model.state_dict(),
                            args.save_direct + CV + '_' + 'train_model_best')
                with open(args.save_direct + CV + '_' + "best_results.txt",
                          "w") as text_file:

                    text_file.write(
                        'epoch {}: loss: {} '.format(epoch, np.mean(LOSS)) +
                        '\n')
                    text_file.write(
                        "val Average CI is {}".format(val_average_CI) + '\n')
                    text_file.write(
                        "val weighted CI is {}".format(val_weighted_CI) + '\n')
                    text_file.write(
                        "val overall CI is {}".format(val_overall_CI) + '\n')

                    text_file.write(
                        "test Average CI is {}".format(test_average_CI) + '\n')
                    text_file.write(
                        "test weighted CI is {}".format(test_weighted_CI) +
                        '\n')
                    text_file.write(
                        "test overall CI is {}".format(test_overall_CI) + '\n')
                    text_file.write(
                        '##############################################' +
                        '\n')

            if (epoch != 0) & (val_average_CI >= best_average_CI):
                best_average_CI = val_average_CI
                # save the best epoch
                paddle.save(model.state_dict(),
                            args.save_direct + CV + '_' + 'train_model_best')
                with open(args.save_direct + CV + '_' + "best_results.txt",
                          "w") as text_file:
                    text_file.write(
                        'epoch {}: loss: {} '.format(epoch, np.mean(LOSS)) +
                        '\n')
                    text_file.write(
                        "val Average CI is {}".format(val_average_CI) + '\n')
                    text_file.write(
                        "val weighted CI is {}".format(val_weighted_CI) + '\n')
                    text_file.write(
                        "val overall CI is {}".format(val_overall_CI) + '\n')

                    text_file.write(
                        "test Average CI is {}".format(test_average_CI) + '\n')
                    text_file.write(
                        "test weighted CI is {}".format(test_weighted_CI) +
                        '\n')
                    text_file.write(
                        "test overall CI is {}".format(test_overall_CI) + '\n')
                    text_file.write(
                        '##############################################' +
                        '\n')

        print(
            '###############################################################')
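Example #30 switches between a DistributedBatchSampler-backed DataLoader and a plain one depending on is_parallel. A minimal sketch of the sampler-based path with a toy dataset standing in for Data_Encoder_flow; on a single card it behaves like a regular batch sampler, and after dist.init_parallel_env() each rank iterates over a disjoint shard of the indices:

import paddle
from paddle.io import DataLoader, Dataset, DistributedBatchSampler

class ToyPairs(Dataset):
    def __init__(self, n=16):
        self.x = paddle.randn([n, 8])
        self.y = paddle.randint(0, 2, [n]).astype('float32')

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def __len__(self):
        return self.x.shape[0]

ds = ToyPairs()
sampler = DistributedBatchSampler(ds, batch_size=4, shuffle=True)
loader = DataLoader(ds, batch_sampler=sampler, return_list=True)
for batch_x, batch_y in loader:
    print(batch_x.shape, batch_y.shape)  # [4, 8] and [4] on each rank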