예제 #1
0
def train(args):
    """Train a DeepSpeech2Model with CTC loss, evaluating once per epoch.

    Reads its whole configuration from ``args`` (manifests, vocabulary,
    model sizes, ``gpus``, ``learning_rate``, ``num_epoch``,
    ``pretrained_model``, ``resume``, ...).  Logging, evaluation and
    checkpointing are only performed by the process whose
    ``dist.get_rank()`` is 0.

    Args:
        args: Parsed command-line namespace providing the attributes read
            below.

    Returns:
        None.  Results are written through ``LogWriter`` and ``save_model``.
    """
    if dist.get_rank() == 0:
        shutil.rmtree('log', ignore_errors=True)
        # Log writer; only bound on rank 0, and every later use of
        # ``writer`` is guarded by the same rank check.
        writer = LogWriter(logdir='log')

    # Enable multi-GPU training when more than one GPU id is listed
    if len(args.gpus.split(',')) > 1:
        dist.init_parallel_env()

    # Build the training data
    train_dataset = PPASRDataset(args.train_manifest,
                                 args.dataset_vocab,
                                 mean_std_filepath=args.mean_std_path,
                                 min_duration=args.min_duration,
                                 max_duration=args.max_duration)
    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset,
                              collate_fn=collate_fn,
                              batch_sampler=batch_sampler,
                              num_workers=args.num_workers)
    # Build the test data (no duration limits, plain non-distributed sampler)
    test_dataset = PPASRDataset(args.test_manifest,
                                args.dataset_vocab,
                                mean_std_filepath=args.mean_std_path)
    batch_sampler = paddle.io.BatchSampler(test_dataset,
                                           batch_size=args.batch_size)
    test_loader = DataLoader(dataset=test_dataset,
                             collate_fn=collate_fn,
                             batch_sampler=batch_sampler,
                             num_workers=args.num_workers)

    # Build the model
    model = DeepSpeech2Model(feat_size=train_dataset.feature_dim,
                             dict_size=len(train_dataset.vocabulary),
                             num_conv_layers=args.num_conv_layers,
                             num_rnn_layers=args.num_rnn_layers,
                             rnn_size=args.rnn_layer_size)
    if dist.get_rank() == 0:
        # The printed message says: the third input dimension is variable
        # length; a fixed value (970) is used only so the summary can show
        # how the output sizes change.
        print('input_size的第三个参数是变长的,这里为了能查看输出的大小变化,指定了一个值!')
        paddle.summary(model,
                       input_size=[(None, train_dataset.feature_dim, 970),
                                   (None, )],
                       dtypes=[paddle.float32, paddle.int64])

    # Enable multi-GPU training
    if len(args.gpus.split(',')) > 1:
        model = paddle.DataParallel(model)

    # Set up the optimizer
    clip = paddle.nn.ClipGradByNorm(clip_norm=3.0)
    # Number of epochs already trained: the last run of digits found in the
    # ``args.resume`` path, or 0 when not resuming.
    # NOTE(review): raises IndexError if the resume path contains no digits.
    last_epoch = int(re.findall(
        r'\d+', args.resume)[-1]) if args.resume is not None else 0
    scheduler = paddle.optimizer.lr.ExponentialDecay(
        learning_rate=args.learning_rate,
        gamma=0.83,
        last_epoch=last_epoch,
        verbose=True)
    optimizer = paddle.optimizer.Adam(
        parameters=model.parameters(),
        learning_rate=scheduler,
        weight_decay=paddle.regularizer.L2Decay(1e-06),
        grad_clip=clip)

    # Build the loss function
    ctc_loss = paddle.nn.CTCLoss()

    # Load the pretrained model
    if args.pretrained_model is not None:
        model_dict = model.state_dict()
        model_state_dict = paddle.load(
            os.path.join(args.pretrained_model, 'model.pdparams'))
        # Feature layers: drop any pretrained weight whose shape does not
        # match the current model, and report weights the checkpoint lacks.
        for name, weight in model_dict.items():
            if name in model_state_dict.keys():
                if weight.shape != list(model_state_dict[name].shape):
                    print('{} not used, shape {} unmatched with {} in model.'.
                          format(name, list(model_state_dict[name].shape),
                                 weight.shape))
                    model_state_dict.pop(name, None)
            else:
                print('Lack weight: {}'.format(name))
        model.set_dict(model_state_dict)
        print('成功加载预训练模型')

    # Resume model and optimizer state from a previous checkpoint
    if args.resume is not None:
        model.set_state_dict(
            paddle.load(os.path.join(args.resume, 'model.pdparams')))
        optimizer.set_state_dict(
            paddle.load(os.path.join(args.resume, 'optimizer.pdopt')))
        print('成功恢复模型参数和优化方法参数')

    # Counters used as the x-axis of the logged scalars; each advances once
    # per logged point, not once per batch.
    train_step = 0
    test_step = 0
    # Start training
    for epoch in range(last_epoch, args.num_epoch):
        for batch_id, (inputs, labels, input_lens,
                       label_lens) in enumerate(train_loader()):

            out, out_lens = model(inputs, input_lens)
            # Swap the first two axes of the model output before the loss
            out = paddle.transpose(out, perm=[1, 0, 2])

            # Compute the loss
            loss = ctc_loss(out, labels, out_lens, label_lens)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            # With multi-GPU training only one process prints
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                print('[%s] Train epoch %d, batch %d, loss: %f' %
                      (datetime.now(), epoch, batch_id, loss))
                writer.add_scalar('Train loss', loss, train_step)
                train_step += 1

            # Also save the model every fixed number of steps
            if batch_id % 2000 == 0 and batch_id != 0 and dist.get_rank() == 0:
                # Save the model
                save_model(args=args,
                           epoch=epoch,
                           model=model,
                           optimizer=optimizer)

        # With multi-GPU training only one process evaluates and saves
        if dist.get_rank() == 0:
            # Run evaluation
            model.eval()
            c = evaluate(model, test_loader, test_dataset.vocabulary)
            print('\n', '=' * 70)
            print('[%s] Test epoch %d, cer: %f' % (datetime.now(), epoch, c))
            print('=' * 70)
            writer.add_scalar('Test cer', c, test_step)
            test_step += 1
            model.train()

            # Record the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)

            # Save the model
            save_model(args=args,
                       epoch=epoch,
                       model=model,
                       optimizer=optimizer)
        # Advance the learning-rate schedule once per epoch (on every rank)
        scheduler.step()
예제 #2
0
def create_data_loader(args, places=None):
    """Build the train and dev data loaders for the ``wmt14ende`` dataset.

    Side effect: stores the padded vocabulary sizes on ``args`` as
    ``src_vocab_size`` and ``trg_vocab_size``.

    Args:
        args: Namespace with the data, vocabulary, batching and padding
            options read below.
        places: Forwarded unchanged to ``DataLoader``.

    Returns:
        A two-element list ``[train_loader, dev_loader]``.

    Raises:
        ValueError: If only one of ``args.train_file`` / ``args.dev_file``
            is set.
    """
    has_train_file = args.train_file is not None
    has_dev_file = args.dev_file is not None
    if has_train_file != has_dev_file:
        raise ValueError(
            "--train_file and --dev_file must be both or neither set. ")

    if has_train_file:
        datasets = load_dataset('wmt14ende',
                                data_files=[args.train_file, args.dev_file],
                                splits=('train', 'dev'))
    else:
        datasets = load_dataset('wmt14ende', splits=('train', 'dev'))

    # An explicit vocabulary file wins; otherwise use the vocabulary
    # description shipped with the first split.
    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file,
                                          unk_token=args.unk_token,
                                          bos_token=args.bos_token,
                                          eos_token=args.eos_token)
    else:
        vocab_key = "benchmark" if args.benchmark else "bpe"
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info[vocab_key])
    # Source and target share a single vocabulary.
    trg_vocab = src_vocab

    def round_up_to_pad_factor(size):
        # Smallest multiple of args.pad_factor that is >= size.
        return (size + args.pad_factor - 1) // args.pad_factor * args.pad_factor

    args.src_vocab_size = round_up_to_pad_factor(len(src_vocab))
    args.trg_vocab_size = round_up_to_pad_factor(len(trg_vocab))

    def convert_samples(sample):
        # Whitespace-tokenise both sides, then map tokens to vocabulary ids.
        src_tokens = sample[args.src_lang].split()
        trg_tokens = sample[args.trg_lang].split()
        return src_vocab.to_indices(src_tokens), trg_vocab.to_indices(
            trg_tokens)

    # NOTE(review): pad_idx is taken from args.bos_idx, not a dedicated pad
    # id — confirm this is intentional.
    collate = partial(prepare_train_input,
                      bos_idx=args.bos_idx,
                      eos_idx=args.eos_idx,
                      pad_idx=args.bos_idx,
                      pad_seq=args.pad_seq,
                      dtype=args.input_dtype)
    length_filter = partial(min_max_filer, max_len=args.max_length)

    data_loaders = [None, None]
    for position, split in enumerate(datasets):
        prepared = split.map(convert_samples, lazy=False).filter(length_filter)
        sampler = TransformerBatchSampler(
            dataset=prepared,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            # Only the first (training) split is sampled in distributed mode.
            distribute_mode=(position == 0),
            world_size=dist.get_world_size(),
            rank=dist.get_rank(),
            pad_seq=args.pad_seq,
            bsz_multi=args.bsz_multi)
        data_loaders[position] = DataLoader(dataset=prepared,
                                            places=places,
                                            batch_sampler=sampler,
                                            collate_fn=collate,
                                            num_workers=args.num_workers)
    return data_loaders
예제 #3
0
def do_train(args):
    """Fine-tune a BERT token-classification model on the ``msra_ner`` dataset.

    Trains for ``args.num_train_epochs`` epochs, or for ``args.max_steps``
    optimizer steps when that is positive and smaller.  Every
    ``args.save_steps`` steps, and once more at the final step, the model is
    evaluated on the test split and its parameters are saved to
    ``args.output_dir`` as ``model_<step>.pdparams``.

    Args:
        args: Namespace providing ``n_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.

    Returns:
        None.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    # The last label id is passed to the alignment helper as "no entity".
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    # Padded label positions get this value, which the loss ignores.
    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict(
        {
            'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id
                                  ),  # segment
            'seq_len': Stack(),  # seq_len
            'labels': Pad(axis=0, pad_val=ignore_label)  # label
        }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    steps_in_all_epochs = len(train_data_loader) * args.num_train_epochs
    num_training_steps = (args.max_steps
                          if args.max_steps > 0 else steps_in_all_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set is used because the optimizer probes it once per parameter.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    # Training ends at whichever comes first: the step budget the LR schedule
    # was built for, or the end of the last epoch.  Without this, a positive
    # --max_steps shaped the schedule but training still ran every epoch.
    last_step = min(num_training_steps, steps_in_all_epochs)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == last_step:
                # Only one process evaluates and writes the checkpoint.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            if global_step >= last_step:
                # Step budget reached: stop instead of finishing the epochs.
                return
예제 #4
0
def do_train(args):
    """Build a BertForPreTraining model, load pickled weights and print one forward pass.

    As written this is a verification driver, not a trainer.  It builds the
    data pipeline and model, loads parameters from the pickle at
    ``args.params_pd_path``, saves them as
    ``<args.model_name_or_path>.pdparams``, prints the model output for the
    first training batch, and then exits the process.

    Args:
        args: Namespace providing ``eager_run``, ``n_gpu``, ``task_name``,
            ``model_type``, ``model_name_or_path``, ``max_seq_length``,
            ``batch_size``, ``learning_rate``, ``warmup_steps``,
            ``max_steps``, ``num_train_epochs``, ``adam_epsilon``,
            ``weight_decay``, ``params_pd_path``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.  ``task_name`` and
            ``model_type`` are lower-cased in place.

    Returns:
        None.
    """
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset, dev_dataset = dataset_class.get_datasets(["train", "dev"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    # Shuffling is disabled here, so batches come in dataset order.
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False)
    # Collate into (input, segment, length, label), then drop the length
    # field (index 2) so each batch is (input, segment, label).
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Stack(dtype="int64"
              if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # The model is built from the named configuration only; no pretrained
    # weights are fetched here (they come from the pickle further down).
    model = BertForPreTraining(
        BertModel(**model_class.pretrained_init_configuration[
            args.model_name_or_path]))
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_warmup_steps = args.warmup_steps
    num_training_steps = (args.max_steps if args.max_steps > 0 else
                          len(train_data_loader) * args.num_train_epochs)

    def lr_factor(current_step):
        # Linear warm-up from 0 to 1, then linear decay to 0 (never negative).
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   lr_factor)

    # Built once: the original rebuilt this name list on every call of
    # apply_decay_param_fun.  Bias and norm parameters are excluded.
    decay_param_names = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_param_names)

    # Classification loss when the dataset has labels, regression otherwise.
    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    # TODO: use hapi (paddle.hapi.Model prepare/fit) instead of the manual loop.

    model.eval()
    param_names = list(model.state_dict().keys())
    import pickle
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # args.params_pd_path at a trusted file.
    with open(args.params_pd_path, "rb") as f:
        np_params = pickle.load(f)
    # Pairs values with state-dict keys purely by position.
    # NOTE(review): assumes np_params is ordered like model.state_dict() — confirm.
    model.set_state_dict(dict(zip(param_names, np_params)))
    paddle.save(model.state_dict(), "%s.pdparams" % args.model_name_or_path)
    for data in train_data_loader():
        # Forward the first batch without its label field, print, and stop.
        print(model(*data[:-1]))
        exit(0)

    # NOTE(review): the exit above fires on the first batch, so the loop below
    # is only reached when the loader yields no batches; it never performs a
    # training step as the code stands.
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0:
                evaluate(model, loss_fct, metric, dev_data_loader)
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            global_step += 1
예제 #5
0
def create_data_loader(args):
    """Build the train and dev data loaders for WMT14 en-de.

    Side effect: stores ``len(src_vocab)`` and ``len(trg_vocab)`` on ``args``
    as ``src_vocab_size`` and ``trg_vocab_size``.

    Args:
        args: Namespace providing ``root``, ``shuffle``, ``shuffle_batch``,
            ``shuffle_seed``, ``max_length``, ``sort_type``, ``pool_size``,
            ``batch_size``, ``bos_idx`` and ``eos_idx``.  The string
            ``"None"`` is accepted for ``root`` and ``shuffle_seed`` and
            treated like ``None``.

    Returns:
        A two-element list ``[train_loader, dev_loader]``.
    """
    root = None if args.root == "None" else args.root
    (src_vocab, trg_vocab) = WMT14ende.get_vocab(root=root)
    args.src_vocab_size, args.trg_vocab_size = len(src_vocab), len(trg_vocab)
    transform_func = WMT14ende.get_default_transform_func(root=root)
    datasets = [
        WMT14ende.get_datasets(mode=m, transform_func=transform_func)
        for m in ["train", "dev"]
    ]

    # The seed is only needed (and only bound) when some shuffling is on;
    # an unset seed falls back to 0.
    if args.shuffle or args.shuffle_batch:
        if args.shuffle_seed == "None" or args.shuffle_seed is None:
            shuffle_seed = 0
        else:
            shuffle_seed = args.shuffle_seed

    def _max_token_fn(current_idx, current_batch_size, tokens_sofar,
                      data_source):
        # Running maximum of (length + 1) over both sides of the samples seen
        # so far in the current batch.
        return max(tokens_sofar,
                   len(data_source[current_idx][0]) + 1,
                   len(data_source[current_idx][1]) + 1)

    def _key(size_so_far, minibatch_len):
        # Batch "size" = sample count times the value tracked by _max_token_fn.
        return size_so_far * minibatch_len

    data_loaders = [None, None]
    for i, dataset in enumerate(datasets):
        mode = dataset.mode
        dataset = dataset.filter(
            partial(min_max_filer, max_len=args.max_length))
        sampler = SamplerHelper(dataset)

        src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
        if args.sort_type == SortType.GLOBAL:
            buffer_size = -1
            trg_key = (lambda x, data_source: len(data_source[x][1]) + 1)
            # Sort twice: by target length, then by source length.
            sampler = sampler.sort(key=trg_key, buffer_size=buffer_size).sort(
                key=src_key, buffer_size=buffer_size)
        else:
            if args.shuffle:
                sampler = sampler.shuffle(seed=shuffle_seed)
            if args.sort_type == SortType.POOL:
                buffer_size = args.pool_size
                sampler = sampler.sort(key=src_key, buffer_size=buffer_size)

        batch_sampler = sampler.batch(batch_size=args.batch_size,
                                      drop_last=False,
                                      batch_size_fn=_max_token_fn,
                                      key=_key)

        # Only the training split is sharded.
        if mode == "train":
            batch_sampler = batch_sampler.shard()

        if args.shuffle_batch:
            # Rebind the result: every other sampler call in this function
            # uses the returned sampler, and discarding it here made
            # shuffle_batch a no-op.
            batch_sampler = batch_sampler.shuffle(seed=shuffle_seed)

        # NOTE(review): pad_idx is taken from args.bos_idx — confirm intended.
        data_loader = DataLoader(dataset=dataset,
                                 batch_sampler=batch_sampler,
                                 collate_fn=partial(prepare_train_input,
                                                    bos_idx=args.bos_idx,
                                                    eos_idx=args.eos_idx,
                                                    pad_idx=args.bos_idx),
                                 num_workers=0,
                                 return_list=True)
        data_loaders[i] = data_loader
    return data_loaders
예제 #6
0
def main():
    """Train or evaluate YOLOv3-DarkNet53 on COCO 2017, driven by ``FLAGS``.

    With ``FLAGS.eval_only`` set, runs prediction on ``val2017`` and feeds
    the results to ``COCOMetric``.  Otherwise trains in two phases: first
    with image mixup, then ``FLAGS.no_mixup_epoch`` epochs without it.
    """
    # NOTE(review): static mode is enabled when FLAGS.dynamic is truthy —
    # confirm the flag polarity is intended.
    if FLAGS.dynamic:
        paddle.enable_static()

    if FLAGS.eval_only:  # evaluation mode
        sample_transform = Compose([
            ResizeImage(target_size=608),
            NormalizeBox(),
            PadBox(),
            BboxXYXY2XYWH()
        ])
        collate_fn = BatchCompose([NormalizeImage()])
        dataset = COCODataset(dataset_dir=FLAGS.data,
                              anno_path='annotations/instances_val2017.json',
                              image_dir='val2017',
                              with_background=False,
                              transform=sample_transform)
        # batch_size can only be 1 in evaluation for YOLOv3
        # prediction bbox is a LoDTensor
        sampler_options = dict(batch_size=1, shuffle=False, drop_last=False)
    else:  # training mode
        sample_transform = Compose([
            ColorDistort(),
            RandomExpand(),
            RandomCrop(),
            RandomFlip(),
            NormalizeBox(),
            PadBox(),
            BboxXYXY2XYWH()
        ])
        collate_fn = BatchCompose([RandomShape(), NormalizeImage()])
        dataset = COCODataset(dataset_dir=FLAGS.data,
                              anno_path='annotations/instances_train2017.json',
                              image_dir='train2017',
                              with_background=False,
                              mixup=True,
                              transform=sample_transform)
        sampler_options = dict(batch_size=FLAGS.batch_size,
                               shuffle=True,
                               drop_last=True)

    # Both modes share the sampler/loader construction; only options differ.
    batch_sampler = DistributedBatchSampler(dataset, **sampler_options)
    loader = DataLoader(dataset,
                        batch_sampler=batch_sampler,
                        num_workers=FLAGS.num_workers,
                        return_list=True,
                        collate_fn=collate_fn)

    # Built-in pretrained weights are requested only when evaluating without
    # an explicit weights path.
    use_builtin_weights = FLAGS.eval_only and FLAGS.weights is None
    model = yolov3_darknet53(num_classes=dataset.num_classes,
                             num_max_boxes=NUM_MAX_BOXES,
                             model_mode='eval' if FLAGS.eval_only else 'train',
                             pretrained=use_builtin_weights)

    if FLAGS.pretrain_weights and not FLAGS.eval_only:
        model.load(FLAGS.pretrain_weights,
                   skip_mismatch=True,
                   reset_optimizer=True)

    optim = make_optimizer(len(batch_sampler),
                           parameter_list=model.parameters())

    model.prepare(optimizer=optim,
                  loss=YoloLoss(num_classes=dataset.num_classes))

    # NOTE: the COCO metric of the YOLOv3 model is implemented here,
    # separately from the 'prepare' and 'fit' framework, for these reasons:
    # 1. The YOLOv3 network structure differs between 'train' and 'eval'
    #    mode; in 'eval' mode the output prediction bbox is not the feature
    #    map used for computing YoloLoss.
    # 2. COCO metric behaviour also differs from a regular Metric: it should
    #    not accumulate in each iteration, only at the end of an epoch.
    if FLAGS.eval_only:
        if FLAGS.weights is not None:
            model.load(FLAGS.weights, reset_optimizer=True)
        predictions = model.predict(loader, stack_outputs=False)
        _, _, _, img_ids, bboxes = predictions

        anno_path = os.path.join(FLAGS.data,
                                 'annotations/instances_val2017.json')
        coco_metric = COCOMetric(anno_path=anno_path, with_background=False)
        for img_id, bbox in zip(img_ids, bboxes):
            coco_metric.update(img_id, bbox)
        coco_metric.accumulate()
        coco_metric.reset()
        return

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    checkpoint_root = FLAGS.save_dir or 'yolo_checkpoint'

    # Phase 1: train with image mixup.
    model.fit(train_data=loader,
              epochs=FLAGS.epoch - FLAGS.no_mixup_epoch,
              save_dir=os.path.join(checkpoint_root, "mixup"),
              save_freq=10)

    # Phase 2: do not use the image mixup transform in the last
    # FLAGS.no_mixup_epoch epochs.
    dataset.mixup = False
    model.fit(train_data=loader,
              epochs=FLAGS.no_mixup_epoch,
              save_dir=os.path.join(checkpoint_root, "no_mixup"),
              save_freq=5)
예제 #7
0
    def __getitem__(self, idx):
        """Return ``(transformed_image, category_index)`` for sample ``idx``.

        The image is opened from the segment entry, passed through
        ``self.transform``, and paired with the integer that
        ``self.category_to_index`` maps its classification category to.
        """
        sample = self.segment[idx]
        with sample.open() as stream:
            # Apply the transform while the underlying stream is still open.
            tensor = self.transform(Image.open(stream))

        category = sample.label.classification.category
        return tensor, self.category_to_index[category]
        # """"""


"""Build a dataloader and run it"""
# Please visit `https://gas.graviti.cn/tensorbay/developer` to get the AccessKey.
# Placeholder value: replace it with a real key before running.
ACCESS_KEY = "<YOUR_ACCESSKEY>"

# Preprocessing pipeline: convert to a tensor, then normalize.
# NOTE(review): mean/std are single-element lists — confirm this matches the
# channel count of the images being loaded.
to_tensor = transforms.ToTensor()
normalization = transforms.Normalize(mean=[0.485], std=[0.229])
my_transforms = transforms.Compose([to_tensor, normalization])

# Wrap the "train" segment, fetched through a GAS client, as a dataset.
train_segment = DogsVsCatsSegment(GAS(ACCESS_KEY),
                                  segment_name="train",
                                  transform=my_transforms)
# Shuffled batches of 4 samples.
train_dataloader = DataLoader(train_segment,
                              batch_size=4,
                              shuffle=True,
                              num_workers=0)

# Iterate over the loader and print each batch's index and labels.
for index, (image, label) in enumerate(train_dataloader):
    print(f"{index}: {label}")
""""""
예제 #8
0
def predict2file(args):
    """Run an MPNet classifier on a GLUE test split and write TSV predictions.

    For ``args.task_name == "mnli"`` both the matched and mismatched test
    splits are predicted and written to two files; every other task yields a
    single file.  Output file names come from ``task2filename`` and are
    placed under the ``template`` directory.

    Args:
        args: Namespace providing ``task_name``, ``ckpt_path``,
            ``max_seq_length`` and ``batch_size``.

    Returns:
        None.
    """
    is_mnli = args.task_name == "mnli"

    if is_mnli:
        test_ds_matched, test_ds_mismatched = load_dataset(
            "glue", "mnli", splits=["test_matched", "test_mismatched"])
        labels = test_ds_matched.label_list
        id2label = dict(zip(range(len(labels)), labels))
    else:
        test_ds = load_dataset("glue", args.task_name, splits="test")
        # Tasks without a label list get no id-to-label mapping.
        if test_ds.label_list is None:
            id2label = None
        else:
            id2label = dict(
                zip(range(len(test_ds.label_list)), test_ds.label_list))

    model = MPNetForSequenceClassification.from_pretrained(args.ckpt_path)
    model.eval()
    tokenizer = MPNetTokenizer.from_pretrained(args.ckpt_path)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    ): fn(samples)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=None,
        max_seq_length=args.max_seq_length,
        is_test=True,
    )

    def build_loader(split):
        # Lazily convert the split, then batch it in order (no shuffling).
        converted = split.map(trans_func, lazy=True)
        sampler = paddle.io.BatchSampler(converted,
                                         batch_size=args.batch_size,
                                         shuffle=False)
        return DataLoader(
            dataset=converted,
            batch_sampler=sampler,
            collate_fn=batchify_fn,
            num_workers=2,
            return_list=True,
        )

    if not is_mnli:
        predict_outputs = predict(build_loader(test_ds), model, id2label)
        file = os.path.join("template", task2filename[args.task_name])
        writetsv(predict_outputs, file)
        return

    loader_matched = build_loader(test_ds_matched)
    loader_mismatched = build_loader(test_ds_mismatched)
    # For MNLI, task2filename holds a pair: (matched, mismatched).
    file_m = os.path.join("template", task2filename[args.task_name][0])
    file_mm = os.path.join("template", task2filename[args.task_name][1])
    matched_outputs = predict(loader_matched, model, id2label)
    mismatched_outputs = predict(loader_mismatched, model, id2label)
    writetsv(matched_outputs, file_m)
    writetsv(mismatched_outputs, file_mm)
예제 #9
0
def do_train(args):
    """Fine-tune a pretrained model on a CLUE task and keep the best checkpoint.

    Builds the train/dev data loaders, the model, a linear-decay-with-warmup
    learning-rate schedule and an AdamW optimizer, then runs the training
    loop. Every ``args.save_steps`` steps (and at the last step) the model is
    evaluated on the dev split; whenever the returned score beats the best one
    seen so far, the model and tokenizer are written to ``args.output_dir``.

    Args:
        args: Parsed option namespace. Attributes read here: ``device``,
            ``task_name``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``warmup_steps``, ``warmup_proportion``,
            ``learning_rate``, ``adam_epsilon``, ``weight_decay``,
            ``max_grad_norm``, ``logging_steps``, ``save_steps`` and
            ``output_dir``. ``task_name`` and ``model_type`` are lower-cased
            in place. The namespace is also passed to ``set_seed``.

    Returns:
        None. The best dev score is printed before returning.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds, dev_ds = load_dataset('clue',
                                    args.task_name,
                                    splits=('train', 'dev'))

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    # Without a label list the labels are stacked as float32 (and MSE loss is
    # used below); with one they are stacked as int64 class ids.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # A positive max_steps overrides the epoch count.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    # A positive warmup_steps takes precedence over warmup_proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test below constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                acc = evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if acc > best_acc:
                    best_acc = acc
                    output_dir = args.output_dir
                    # This branch is not restricted to one rank, so several
                    # processes may get here together; exist_ok avoids the
                    # FileExistsError a check-then-create sequence can hit.
                    os.makedirs(output_dir, exist_ok=True)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                print("best_acc: ", best_acc)
                return
    print("best_acc: ", best_acc)
예제 #10
0
 def prepare_dataloader(dataset):
     """Wrap ``dataset`` in a DataLoader with batch size 1 and drop_last on.

     ``places`` and ``num_workers`` are not parameters; they are resolved
     from the surrounding scope.
     """
     loader_options = {
         "places": places,
         "num_workers": num_workers,
         "batch_size": 1,
         "drop_last": True,
     }
     return DataLoader(dataset, **loader_options)
    def test_main(self):
        """Check that DataLoader rejects invalid constructor arguments.

        Each invalid configuration below is expected to make the
        ``DataLoader`` constructor raise ``AssertionError``; the final case
        checks that a valid ``batch_sampler`` is accepted.

        The negative cases use ``assertRaises``. A hand-written
        ``try: ...; self.assertTrue(False) / except AssertionError: pass``
        cannot fail, because the failure raised by ``assertTrue(False)`` is
        itself an ``AssertionError`` and is swallowed by the ``except``.
        """
        place = fluid.cpu_places()[0]
        with fluid.dygraph.guard(place):
            dataset = RandomDataset(100)
            batch_sampler = BatchSampler(dataset=dataset, batch_size=4)

            # dataset is not instance of Dataset
            with self.assertRaises(AssertionError):
                DataLoader(dataset=batch_sampler, places=place)

            # places is None
            with self.assertRaises(AssertionError):
                DataLoader(dataset=dataset, places=None)

            # num_workers < 0
            with self.assertRaises(AssertionError):
                DataLoader(dataset=dataset, places=place, num_workers=-1)

            # timeout < 0
            with self.assertRaises(AssertionError):
                DataLoader(dataset=dataset, places=place, timeout=-1)

            # batch_sampler is not instance of BatchSampler
            with self.assertRaises(AssertionError):
                DataLoader(dataset=dataset,
                           places=place,
                           batch_sampler=dataset)

            # set batch_sampler and shuffle/batch_size/drop_last
            with self.assertRaises(AssertionError):
                DataLoader(dataset=dataset,
                           places=place,
                           batch_sampler=batch_sampler,
                           shuffle=True,
                           drop_last=True)

            # set batch_sampler correctly
            try:
                DataLoader(dataset=dataset,
                           places=place,
                           batch_sampler=batch_sampler)
            except AssertionError:
                self.fail("DataLoader rejected a valid batch_sampler")
예제 #12
0
def do_train(args):
    """Train a width-elastic supernet from a fine-tuned model with OFA distillation.

    The fine-tuned model found at ``args.model_name_or_path`` is loaded twice:
    once to be converted into a supernet (the student) and once unchanged as
    the teacher. Heads and neurons of the student are reordered by importance
    computed on the dev split, then the student is trained for every width in
    ``args.width_mult_list`` using the distillation loss plus a soft
    cross-entropy on the teacher logits. Every ``args.save_steps`` steps (and
    at the last step) the teacher and each student width are evaluated; when a
    width beats the best result so far the model and tokenizer are saved to
    ``args.output_dir``.

    Args:
        args: Parsed option namespace. Attributes read here: ``device``,
            ``task_name``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``warmup_steps``, ``warmup_proportion``,
            ``learning_rate``, ``adam_epsilon``, ``weight_decay``,
            ``max_grad_norm``, ``width_mult_list``, ``lambda_logit``,
            ``logging_steps``, ``save_steps`` and ``output_dir``.
            ``task_name`` and ``model_type`` are lower-cased in place. The
            namespace is also passed to ``set_seed``.

    Returns:
        None. The best evaluation result is printed before returning.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    train_ds = load_dataset('clue', args.task_name, splits='train')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # Without a label list the labels are stacked as float32 (and MSE loss is
    # used below); with one they are stacked as int64 class ids.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    num_labels = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    # Step1: Initialize a dictionary to save the weights from the origin PPMiniLM model.
    origin_weights = model.state_dict()

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=[1.0])
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    super_sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    model.set_state_dict(super_sd)

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    # The embeddings and every encoder layer are listed as mapping layers.
    mapping_layers = ['ppminilm.embeddings']
    for idx in range(model.ppminilm.config['num_hidden_layers']):
        mapping_layers.append('ppminilm.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    #### Step6: Calculate the importance of neurons and head,
    #### and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.ppminilm.config['num_hidden_layers'],
        num_heads=model.ppminilm.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    # A positive max_steps overrides the epoch count.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    # A positive warmup_steps takes precedence over warmup_proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test below constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    global_step = 0
    tic_train = time.time()
    best_res = 0.0
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            # Gradients from every width are accumulated before the single
            # optimizer step that follows this inner loop.
            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids,
                                                   segment_ids,
                                                   attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                logit_loss = soft_cross_entropy(logits,
                                                teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                # The logged loss is the one from the last width in the list.
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(teacher_model,
                         metric,
                         dev_data_loader,
                         width_mult=100)
                print("eval done total : %s s" % (time.time() - tic_eval))
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    res = evaluate(ofa_model, metric, dev_data_loader,
                                   width_mult)
                    print("eval done total : %s s" % (time.time() - tic_eval))

                    if best_res < res:
                        output_dir = args.output_dir
                        # This branch is not restricted to one rank, so
                        # several processes may get here together; exist_ok
                        # avoids the FileExistsError a check-then-create
                        # sequence can hit.
                        os.makedirs(output_dir, exist_ok=True)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        best_res = res
            if global_step >= num_training_steps:
                print("best_res: ", best_res)
                return
    print("best_res: ", best_res)
예제 #13
0
def do_train():
    """Train ERNIE token classification on the DuIE data files.

    Reads its configuration from the module-level ``args`` (``device``,
    ``data_path``, ``max_seq_length``, ``batch_size``, ``num_train_epochs``,
    ``learning_rate``, ``warmup_ratio``, ``weight_decay``, ``output_dir``).
    On rank 0 it logs every 50 steps, evaluates on ``dev_data.json`` and
    saves a checkpoint every 10000 steps, and does both once more after the
    last epoch. Exits through ``sys.exit`` when ``predicate2id.json`` is
    missing from ``args.data_path``.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Load the label map; abort when the file is not there.
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not os.path.isfile(label_map_path):
        sys.exit("{} dose not exists or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)
    entry_count = len(label_map.keys())
    num_classes = (entry_count - 2) * 2 + 2

    # Pretrained ERNIE model, tokenizer and loss.
    model = paddle.DataParallel(
        ErnieForTokenClassification.from_pretrained("ernie-1.0",
                                                    num_classes=num_classes))
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    criterion = BCELossForDuIE()

    # Training data.
    train_file_path = os.path.join(args.data_path, 'train_data.json')
    train_dataset = DuIEDataset.from_file(train_file_path, tokenizer,
                                          args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    collator = DataCollator()
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=collator,
                                   return_list=True)

    # Evaluation data (the dev file).
    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                         args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(test_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False,
                                                drop_last=True)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=collator,
                                  return_list=True)

    # Learning-rate schedule spanning every training step.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_ratio)

    # Weight decay applies to every parameter whose name contains neither
    # "bias" nor "norm".
    excluded_markers = ["bias", "norm"]
    decay_params = [
        param.name for name, param in model.named_parameters()
        if not any(marker in name for marker in excluded_markers)
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Training loop.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        epoch_start = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            # Only the first and last of the five batch fields are used here.
            input_ids, _, _, _, labels = batch
            logits = model(input_ids=input_ids)
            # Positions holding token id 0, 1 or 2 are masked out of the loss.
            not_id0 = input_ids != 0
            not_id1 = input_ids != 1
            not_id2 = input_ids != 2
            mask = not_id0.logical_and(not_id1).logical_and(not_id2)
            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            global_step += 1

            # Logging, evaluation and checkpointing happen on rank 0 only.
            if rank != 0:
                continue

            if global_step % logging_steps == 0:
                speed = logging_steps / (time.time() - tic_train)
                print(
                    "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                    % (epoch, args.num_train_epochs, step, steps_by_epoch,
                       loss_item, speed))
                tic_train = time.time()

            if global_step % save_steps == 0:
                print("\n=====start evaluating ckpt of %d steps=====" %
                      global_step)
                precision, recall, f1 = evaluate(model, criterion,
                                                 test_data_loader,
                                                 eval_file_path, "eval")
                print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                      (100 * precision, 100 * recall, 100 * f1))
                print("saving checkpoing model_%d.pdparams to %s " %
                      (global_step, args.output_dir))
                checkpoint_path = os.path.join(
                    args.output_dir, "model_%d.pdparams" % global_step)
                paddle.save(model.state_dict(), checkpoint_path)
                model.train()  # back to train mode

        epoch_seconds = time.time() - epoch_start
        print("epoch time footprint: %d hour %d min %d sec" %
              (epoch_seconds // 3600, (epoch_seconds % 3600) // 60,
               epoch_seconds % 60))

    # Final evaluation and checkpoint, rank 0 only.
    if rank == 0:
        print("\n=====start evaluating last ckpt of %d steps=====" %
              global_step)
        precision, recall, f1 = evaluate(model, criterion, test_data_loader,
                                         eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
              (100 * precision, 100 * recall, 100 * f1))
        final_path = os.path.join(args.output_dir,
                                  "model_%d.pdparams" % global_step)
        paddle.save(model.state_dict(), final_path)
        print("\n=====training complete=====")
예제 #14
0
def do_eval(args):
    """Evaluate a BERT token-classification model on the MSRA-NER test split.

    Loads the dataset and tokenizer, aligns the word-level tags with the
    tokenized inputs, runs the model over the evaluation set without
    gradient tracking and prints the loss together with chunk-level
    precision, recall and F1.

    The printed loss is the mean of the per-batch losses over the whole
    evaluation set (``nan`` when the loader yields no batch), not the loss of
    the last batch only.

    Args:
        args: Parsed option namespace. Attributes read here: ``device``,
            ``model_name_or_path``, ``max_seq_length``, ``batch_size`` and
            ``init_checkpoint_path`` (optional parameters file loaded on top
            of the pretrained weights when truthy).

    Returns:
        None. Results are printed.
    """
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        """Tokenize a batch of examples and build same-length label lists."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            seq_len = len(tokenized_inputs['input_ids'][i])
            # Two positions are reserved: one label is added on each side
            # below, so at most seq_len - 2 original labels are kept.
            label_ids = [no_entity_id] + label[:seq_len - 2] + [no_entity_id]
            # Fill up to the tokenized length with the no-entity id.
            label_ids += [no_entity_id] * (seq_len - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    # NOTE(review): the last example of the split is dropped here; the reason
    # is not visible in this function - confirm it is still needed.
    eval_ds = eval_ds.select(range(len(eval_ds) - 1))
    eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    total_loss = 0.0
    num_batches = 0
    # Evaluation needs no gradients; skip building the autograd graph.
    with paddle.no_grad():
        for batch in eval_data_loader:
            input_ids, token_type_ids, length, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            total_loss += paddle.mean(loss).numpy().item()
            num_batches += 1
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                length, preds, labels)
            metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                          num_correct_chunks.numpy())

    avg_loss = total_loss / num_batches if num_batches else float('nan')
    # The metric keeps running counts, so one read after the loop is enough.
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def train(args):
    """Train a MobileFaceNet / resnet_face34 backbone with an ArcNet head.

    Supports multi-GPU training (enabled when ``args.gpus`` lists more than
    one device), loading pretrained backbone weights, and resuming from a
    saved epoch. After every epoch rank 0 runs ``test`` on the model, logs
    the accuracy and learning rate, and saves a checkpoint.

    Args:
        args: Parsed option namespace. Attributes read here: ``gpus``,
            ``train_root_path``, ``batch_size``, ``num_workers``,
            ``use_model``, ``resume``, ``learning_rate``,
            ``pretrained_model`` and ``num_epoch``. The namespace is also
            passed to ``save_model``.
    """
    multi_gpu = len(args.gpus.split(',')) > 1
    # Enable multi-GPU training.
    if multi_gpu:
        dist.init_parallel_env()
    rank = dist.get_rank()
    is_main = rank == 0
    if is_main:
        shutil.rmtree('log', ignore_errors=True)
        # Log writer (only created on rank 0).
        writer = LogWriter(logdir='log')

    # Build the dataset and its loader; the sampler is the distributed one
    # when training on several GPUs.
    train_dataset = CustomDataset(args.train_root_path, is_train=False)
    if multi_gpu:
        sampler_cls = paddle.io.DistributedBatchSampler
    else:
        sampler_cls = paddle.io.BatchSampler
    batch_sampler = sampler_cls(train_dataset, batch_size=args.batch_size, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_sampler=batch_sampler, num_workers=args.num_workers)
    print("[%s] 总数据类别为:%d" % (datetime.now(), train_dataset.num_classes))

    # Build the backbone; a resnet variant is offered next to MobileFaceNet.
    model = resnet_face34() if args.use_model == 'resnet_face34' else MobileFaceNet()
    metric_fc = ArcNet(feature_dim=512, class_dim=train_dataset.num_classes)
    if is_main:
        paddle.summary(model, input_size=(None, 3, 112, 112))

    # Wrap both networks for multi-GPU training.
    if multi_gpu:
        model = paddle.DataParallel(model)
        metric_fc = paddle.DataParallel(metric_fc)

    # Epoch to start from: one past the last number found in the resume path.
    if args.resume is not None:
        last_epoch = int(re.findall(r'\d+', args.resume)[-1]) + 1
    else:
        last_epoch = 0
    # Learning-rate decay.
    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=args.learning_rate, step_size=10, gamma=0.1, last_epoch=last_epoch, verbose=True)
    # Optimizer over the parameters of both networks.
    optimizer = paddle.optimizer.Momentum(parameters=model.parameters() + metric_fc.parameters(),
                                          learning_rate=scheduler,
                                          momentum=0.9,
                                          weight_decay=paddle.regularizer.L2Decay(5e-4))

    # Load pretrained backbone weights.
    if args.pretrained_model is not None:
        current_state = model.state_dict()
        pretrained_state = paddle.load(os.path.join(args.pretrained_model, 'model.pdparams'))
        # Drop pretrained entries whose shape differs from the model's and
        # report model weights that the file does not provide.
        for name, weight in current_state.items():
            if name not in pretrained_state:
                print('Lack weight: {}'.format(name))
                continue
            if weight.shape != list(pretrained_state[name].shape):
                print('{} not used, shape {} unmatched with {} in model.'.
                      format(name, list(pretrained_state[name].shape), weight.shape))
                pretrained_state.pop(name, None)
        model.set_dict(pretrained_state)
        print('[%s] Rank %d 成功加载 model 参数' % (datetime.now(), rank))

    # Resume training from a saved epoch.
    if args.resume is not None:
        model.set_state_dict(paddle.load(os.path.join(args.resume, 'model.pdparams')))
        metric_fc.set_state_dict(paddle.load(os.path.join(args.resume, 'metric_fc.pdparams')))
        optimizer.set_state_dict(paddle.load(os.path.join(args.resume, 'optimizer.pdopt')))
        print('[%s] Rank %d 成功加载模型参数和优化方法参数' % (datetime.now(), rank))

    # Loss function.
    loss = paddle.nn.CrossEntropyLoss()
    train_step = 0
    test_step = 0
    sum_batch = len(train_loader) * (args.num_epoch - last_epoch)
    # Start training.
    for epoch in range(last_epoch, args.num_epoch):
        loss_sum = []
        accuracies = []
        for batch_id, (img, label) in enumerate(train_loader()):
            start = time.time()
            feature = model(img)
            output = metric_fc(feature, label)
            # Compute the loss and update the parameters.
            batch_loss = loss(output, label)
            batch_loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # Compute the accuracy.
            label = paddle.reshape(label, shape=(-1, 1))
            batch_acc = accuracy(input=paddle.nn.functional.softmax(output), label=label)
            accuracies.append(batch_acc.numpy()[0])
            loss_sum.append(batch_loss.numpy()[0])
            # With several GPUs only one process prints.
            if batch_id % 100 == 0 and is_main:
                # Remaining batches times the duration of this batch.
                batches_left = sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id
                eta_sec = ((time.time() - start) * 1000) * batches_left
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader), sum(loss_sum) / len(loss_sum), sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', batch_loss, train_step)
                train_step += 1
                # Only the loss window is reset; accuracies keep accumulating
                # for the rest of the epoch.
                loss_sum = []
        # With several GPUs only one process evaluates and saves the model.
        if is_main:
            print('='*70)
            test_acc = test(model)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, test_acc))
            print('='*70)
            writer.add_scalar('Test acc', test_acc, test_step)
            # Record the learning rate.
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()
    save_model(args, args.num_epoch, model, metric_fc, optimizer)
예제 #16
0
                                   parameters=generator.parameters(),
                                   beta1=0.5,
                                   beta2=0.999)

# Adam optimizer for the discriminator's parameters.
optimizerD = paddle.optimizer.Adam(learning_rate=LR,
                                   parameters=discriminator.parameters(),
                                   beta1=0.5,
                                   beta2=0.999)

# Loss functions
bce_loss = nn.BCELoss()
l1_loss = nn.L1Loss()

# Data loaders: the training loader shuffles and drops the last incomplete
# batch; the test loader keeps the dataset order and every sample.
data_loader_train = DataLoader(paired_dataset_train,
                               batch_size=BATCH_SIZE,
                               shuffle=True,
                               drop_last=True)

data_loader_test = DataLoader(paired_dataset_test, batch_size=BATCH_SIZE)

# In[11]:

# Output directories are created up front; exist_ok makes reruns harmless.
results_save_path = 'work/results'
os.makedirs(results_save_path, exist_ok=True)  # stores the test results of every epoch

weights_save_path = 'work/weights'
os.makedirs(weights_save_path, exist_ok=True)  # stores the model weights

# Training loop: each batch from the training loader unpacks into a pair.
for epoch in range(EPOCHS):
    for data in tqdm(data_loader_train):
        real_A, real_B = data
예제 #17
0
def eval():
    """Evaluate an ELMo checkpoint on the dev set and print loss / perplexity.

    Command-line arguments are read through ``parse_args()``. The checkpoint
    given by ``args.init_from_ckpt`` (with a ``.pdparams`` suffix appended) is
    loaded into a freshly built ELMo model, the dev data is iterated once, and
    rank 0 prints a progress line every ``args.log_freq`` steps plus a final
    average.

    Raises:
        ValueError: if ``args.init_from_ckpt`` is not set.
    """
    paddle.disable_static()
    n_gpus = dist.get_world_size()
    rank = dist.get_rank()

    if n_gpus > 1:
        dist.init_parallel_env()

    args = parse_args()
    if not args.init_from_ckpt:
        raise ValueError('init_from_ckpt should be set when eval.')
    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_gpus > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.eval()

    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
    elmo.set_state_dict(weight_state_dict)
    print("Loaded checkpoint from %s" % args.init_from_ckpt)

    dev_dataset = OneBillionWordDataset(args.dev_data_path,
                                        vocab,
                                        args.batch_size,
                                        args.unroll_steps,
                                        n_gpus,
                                        rank,
                                        mode='test',
                                        shuffle=False,
                                        seed=args.random_seed)

    # batch_size=None: the dataset is handed to the loader without
    # additional batching by the DataLoader.
    dev_dataloader = DataLoader(dev_dataset, return_list=True, batch_size=None)

    total_step = total_loss = 0
    total_time = 0.0
    batch_start_time = time.time()
    # backward() is never called here, so disable gradient recording for the
    # whole evaluation pass instead of building an unused autograd graph.
    with paddle.no_grad():
        for step, inputs in enumerate(dev_dataloader, start=1):
            ids, next_ids, ids_reverse, next_ids_reverse = inputs
            outputs = elmo([ids, ids_reverse])
            loss = elmo_loss(outputs, [next_ids, next_ids_reverse])

            # Read the scalar once; it feeds both the running sum and the log.
            loss_value = loss.numpy()[0]
            total_loss += loss_value
            total_step += 1

            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.log_freq == 0:
                    # Perplexity is only needed for the progress line.
                    ppl = paddle.exp(loss)
                    print(
                        "Eval step %d - loss: %.4f - Perplexity: %.4f - %.3fs/step"
                        % (step, loss_value * args.unroll_steps,
                           ppl.numpy()[0], total_time / args.log_freq))
                    total_time = 0.0
            batch_start_time = time.time()

    avg_loss = total_loss / total_step
    avg_ppl = math.exp(avg_loss)
    if rank == 0:
        print("Eval - average loss: %.4f - average Perplexity: %.4f" %
              (avg_loss * args.unroll_steps, avg_ppl))
예제 #18
0
def do_predict(args):
    """Predict entity tags for the MSRA-NER test split and save them to disk.

    Loads the tokenizer and token-classification model named by
    ``args.model_name_or_path`` (optionally overriding the weights with
    ``args.init_checkpoint_path``), runs the model over the test split,
    decodes the predictions with ``parse_decodes`` and writes them to
    ``results.txt``, printing the first ten lines as a preview.
    """
    paddle.set_device(args.device)

    # Dataset and tokenizer. The train split is loaded for its label list.
    train_ds, predict_ds = load_dataset('msra_ner',
                                        splits=('train', 'test'),
                                        lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_num = len(train_ds.label_list)
    # The last label id is passed to the alignment helper as the
    # "no entity" id.
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=label_num - 1,
                         max_seq_len=args.max_seq_length)

    ignore_label = -100
    # Per-field collation: pad the variable-length fields, stack the lengths.
    collator = Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    })

    def batchify_fn(samples):
        return collator(samples)

    # Keep the raw examples (taken before mapping) for decoding afterwards.
    raw_data = predict_ds.data

    id2label = {idx: label for idx, label in enumerate(predict_ds.label_list)}

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Build the model and optionally load fine-tuned weights.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model.set_dict(paddle.load(args.init_checkpoint_path))

    model.eval()
    pred_list, len_list = [], []
    for batch in predict_data_loader:
        # The labels in the batch are not needed for prediction.
        input_ids, token_type_ids, length, _ = batch
        logits = model(input_ids, token_type_ids)
        pred_list.append(paddle.argmax(logits, axis=-1).numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Show a short preview of what was written.
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
예제 #19
0
def run(args):
    """Fine-tune and/or evaluate an extractive question-answering model.

    When ``args.do_train`` is set, the model is trained on the dataset named
    by ``args.task_name`` (or on ``args.train_file``) and rank 0 saves a
    checkpoint every ``args.save_steps`` steps and at the final step. When
    ``args.do_predict`` is set, rank 0 runs ``evaluate`` on the dev split
    (or on ``args.predict_file``).
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    set_seed(args)
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses ArrowTable as basic data structure, while we use a list of dictionaries instead.
        contexts = [example['context'] for example in examples]
        questions = [example['question'] for example in examples]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Let's label those examples!
        for i, tokenized_example in enumerate(tokenized_examples):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_example["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # The offset mappings will give us a map from token to character position in the original context. This will
            # help us compute the start_positions and end_positions.
            offsets = tokenized_example['offset_mapping']

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            answers = examples[sample_index]['answers']
            answer_starts = examples[sample_index]['answer_starts']

            # Start/end character index of the answer in the text.
            start_char = answer_starts[0]
            end_char = start_char + len(answers[0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # Minus one more to reach actual text
            token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char):
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[
                        token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples[i][
                    "start_positions"] = token_start_index - 1
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples[i]["end_positions"] = token_end_index + 1

        return tokenized_examples

    if args.do_train:
        if args.train_file:
            train_ds = load_dataset(task_name, data_files=args.train_file)
        else:
            train_ds = load_dataset(task_name, splits='train')
        train_ds.map(prepare_train_features, batched=True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict(
            {
                "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
                "token_type_ids": Pad(axis=0,
                                      pad_val=tokenizer.pad_token_type_id),
                "start_positions": Stack(dtype="int64"),
                "end_positions": Stack(dtype="int64")
            }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        # A positive max_steps overrides the epoch-based step count.
        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        # A set keeps the per-parameter membership test below O(1).
        decay_params = {
            p.name
            for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        }
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if rank == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        # exist_ok avoids the check-then-create race.
                        os.makedirs(output_dir, exist_ok=True)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses ArrowTable as basic data structure, while we use a list of dictionaries instead.
        contexts = [example['context'] for example in examples]
        questions = [example['question'] for example in examples]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # For validation, there is no need to compute start and end positions
        for i, tokenized_example in enumerate(tokenized_examples):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            tokenized_examples[i]["example_id"] = examples[sample_index]['id']

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        return tokenized_examples

    # Prediction runs on rank 0 only.
    if args.do_predict and rank == 0:

        if args.predict_file:
            dev_ds = load_dataset(task_name, data_files=args.predict_file)
        else:
            dev_ds = load_dataset(task_name, splits='dev')

        dev_ds.map(prepare_validation_features, batched=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids":
            Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids":
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        evaluate(model, dev_data_loader, args)
예제 #20
0
def main(args):
    """Train a UnifiedTransformer language-model head on dialogue data.

    Builds the model and tokenizer from ``args.model_name_or_path``, trains
    for ``args.epochs`` epochs with AdamW under a Noam learning-rate
    schedule and global-norm gradient clipping. Rank 0 logs every
    ``args.logging_steps`` steps, and every ``args.save_steps`` steps it
    runs ``evaluation`` on the validation loader and saves a checkpoint via
    ``save_ckpt``.
    """
    paddle.set_device('gpu' if args.n_gpus else 'cpu')
    paddle.seed(args.seed)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1:
        dist.init_parallel_env()

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_dataset = DialogueDataset(args.train_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    args.seed,
                                    mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)
    valid_dataset = DialogueDataset(args.valid_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    mode='valid')
    valid_dataloader = DataLoader(valid_dataset,
                                  return_list=True,
                                  batch_size=None)

    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)

    # Names of the parameters that receive weight decay: everything except
    # bias and norm parameters. Collected once into a set so the optimizer's
    # per-parameter callback is a constant-time lookup instead of rebuilding
    # the whole name list from the model on every call.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_dataloader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs

            logits = model(token_ids, type_ids, pos_ids, generation_mask,
                           tgt_pos)
            loss = F.cross_entropy(logits, tgt_label)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            # Accumulate the time spent on this step; it is reset after
            # each log line.
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0
                if step % args.save_steps == 0:
                    evaluation(model, valid_dataloader)
                    save_ckpt(model, tokenizer, args.save_dir, step)
            # Restart the timer after logging/saving so that work is not
            # counted in the next step's time.
            batch_start_time = time.time()
예제 #21
0
def run(args):
    """Fine-tune and/or test a question-answering model on CLUE CMRC2018.

    With ``args.do_train`` the model is trained using gradient accumulation
    and evaluated on the validation split after every epoch; on rank 0 the
    best exact-match result is tracked and, when ``args.save_best_model``
    is set, the model and tokenizer are saved to ``args.output_dir``.
    Training returns early once ``num_training_steps`` optimizer steps have
    been taken. With ``args.do_predict`` rank 0 runs ``evaluate`` on the
    test split with ``do_eval=False``.
    """
    if args.do_train:
        assert args.batch_size % args.gradient_accumulation_steps == 0, \
            "Please make sure argmument `batch_size` must be divisible by `gradient_accumulation_steps`."
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    set_seed(args)

    train_examples, dev_examples, test_examples = load_dataset(
        'clue', 'cmrc2018', split=["train", "validation", "test"])

    column_names = train_examples.column_names
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            logger.info("init checkpoint from %s" % args.model_name_or_path)

    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses ArrowTable as basic data structure, while we use a list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples['answers'][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)

        return tokenized_examples

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses ArrowTable as basic data structure, while we use a list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length,
                                       return_attention_mask=True)

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # token_type_ids value that marks context tokens.
        context_index = 1
        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not. The last position is always set to None as well.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index
                 and k != len(sequence_ids) - 1 else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if args.do_train:
        # args.batch_size is the effective batch; each forward/backward pass
        # sees batch_size / gradient_accumulation_steps samples.
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)

        with main_process_first(desc="train dataset map pre-processing"):
            train_ds = train_examples.map(
                prepare_train_features,
                batched=True,
                remove_columns=column_names,
                load_from_cache_file=not args.overwrite_cache,
                num_proc=args.num_proc,
                desc="Running tokenizer on train dataset")
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)

        batchify_fn = DataCollatorWithPadding(tokenizer)
        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=batchify_fn,
                                       return_list=True)

        with main_process_first(desc="evaluate dataset map pre-processing"):
            # The cache must be bypassed exactly when overwrite_cache is set,
            # same as for the train split above (the flag used to be passed
            # without negation here, inverting its meaning).
            dev_ds = dev_examples.map(
                prepare_validation_features,
                batched=True,
                remove_columns=column_names,
                num_proc=args.num_proc,
                load_from_cache_file=not args.overwrite_cache,
                desc="Running tokenizer on validation dataset")
        # The model only consumes the tokenizer inputs; the bookkeeping
        # columns stay in dev_ds for post-processing inside evaluate().
        dev_ds_for_model = dev_ds.remove_columns(
            ["example_id", "offset_mapping", "attention_mask"])
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)

        dev_data_loader = DataLoader(dataset=dev_ds_for_model,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     return_list=True)

        # Counted in optimizer steps, i.e. after gradient accumulation.
        num_training_steps = int(
            args.max_steps /
            args.gradient_accumulation_steps) if args.max_steps >= 0 else int(
                len(train_data_loader) * args.num_train_epochs /
                args.gradient_accumulation_steps)

        warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, warmup)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()
        best_res = (0.0, 0.0)
        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                start_positions = batch.pop("start_positions")
                end_positions = batch.pop("end_positions")
                logits = model(**batch)
                loss = criterion(logits, (start_positions, end_positions))
                if args.gradient_accumulation_steps > 1:
                    # Scale so the accumulated gradient matches one full batch.
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()

                    if global_step % args.logging_steps == 0:
                        logger.info(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss, args.logging_steps /
                               (time.time() - tic_train)))
                        tic_train = time.time()
                    if global_step >= num_training_steps:
                        logger.info("best_result: %.2f/%.2f" %
                                    (best_res[0], best_res[1]))
                        return
            # End of epoch: evaluate on the validation split.
            em, f1 = evaluate(model, dev_examples, dev_ds, dev_data_loader,
                              args)
            if rank == 0 and em > best_res[0]:
                best_res = (em, f1)
                if args.save_best_model:
                    output_dir = args.output_dir
                    # exist_ok avoids the check-then-create race.
                    os.makedirs(output_dir, exist_ok=True)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
        logger.info("best_result: %.2f/%.2f" % (best_res[0], best_res[1]))

    # Prediction runs on rank 0 only.
    if args.do_predict and rank == 0:
        test_ds = test_examples.map(prepare_validation_features,
                                    batched=True,
                                    remove_columns=column_names,
                                    num_proc=args.num_proc)
        test_ds_for_model = test_ds.remove_columns(
            ["example_id", "offset_mapping", "attention_mask"])

        test_batch_sampler = paddle.io.BatchSampler(
            test_ds_for_model, batch_size=args.eval_batch_size, shuffle=False)

        batchify_fn = DataCollatorWithPadding(tokenizer)
        test_data_loader = DataLoader(dataset=test_ds_for_model,
                                      batch_sampler=test_batch_sampler,
                                      collate_fn=batchify_fn,
                                      return_list=True)

        evaluate(model,
                 test_examples,
                 test_ds,
                 test_data_loader,
                 args,
                 do_eval=False)
예제 #22
0
    def get_data_sample(self, task_id, train=True):
        """Assemble the adaptation and evaluation inputs for a single task.

        Args:
            task_id: Position of the task in ``self.train_tasks`` (when
                ``train`` is true) or in ``self.test_tasks`` (otherwise).
            train: Choose the training-task path (default) or the test-task
                path.

        Returns:
            tuple: ``(adapt_data, eval_data)``, two dicts. On the training
            path they carry batched graphs and label tensors; on the test
            path they carry a ``data_loader`` entry and, except when
            ``self.update_s_q`` is false, the support set and its labels.
        """

        def stack_first_labels(samples):
            # Gather the first label entry of every sample into one array.
            return np.stack([item.y[0] for item in samples.data_list])

        if train:
            task = self.train_tasks[task_id]
            if task not in self.preload_train_data:
                # Not cached: build the dataset from its on-disk location.
                dataset = MoleculeDataset(self.data_dir + self.dataset +
                                          "/new/" + str(task + 1),
                                          dataset=self.dataset)
            else:
                dataset = self.preload_train_data[task]

            if not self.update_s_q:
                support, query = sample_datasets(dataset, self.dataset, task,
                                                 self.n_shot_train,
                                                 self.n_query)
                support_y = stack_first_labels(support)
                query_y = stack_first_labels(query)
                adapt_data = {
                    'data': G.Graph.batch(support.data_list),
                    'label': paddle.to_tensor(support_y)
                }
                eval_data = {
                    'data': G.Graph.batch(query.data_list),
                    'label': paddle.to_tensor(query_y)
                }
                return adapt_data, eval_data

            support, query, support_eval, query_eval = sample_meta_datasets(
                dataset, self.dataset, task, self.n_shot_train, self.n_query)

            support_y = stack_first_labels(support)
            query_y = stack_first_labels(query)
            support_eval_y = stack_first_labels(support_eval)
            query_eval_y = stack_first_labels(query_eval)

            # Separate support/query pairs for the adapt and eval phases;
            # 'label' is the support labels followed by the query labels.
            adapt_data = {
                's_data': G.Graph.batch(support.data_list),
                's_label': paddle.to_tensor(support_y),
                'q_data': G.Graph.batch(query.data_list),
                'q_label': paddle.to_tensor(query_y),
                'label': paddle.to_tensor(
                    np.concatenate([support_y, query_y]))
            }
            eval_data = {
                's_data': G.Graph.batch(support_eval.data_list),
                's_label': paddle.to_tensor(support_eval_y),
                'q_data': G.Graph.batch(query_eval.data_list),
                'q_label': paddle.to_tensor(query_eval_y),
                'label': paddle.to_tensor(
                    np.concatenate([support_eval_y, query_eval_y]))
            }
            return adapt_data, eval_data

        # ---- test-task path ----
        task = self.test_tasks[task_id]

        if 'train' in self.dataset:
            dataset = self.preload_test_data[task]
            # The support set comes from the validation cache when enabled,
            # otherwise from the training cache.
            if self.args.support_valid:
                val_dataset = self.preload_valid_data[task]
                data_name = self.dataset.replace('train', 'valid')
            else:
                val_dataset = self.preload_train_data[task]
                data_name = self.dataset
            support, _, query_adapt = sample_test_datasets(
                val_dataset, data_name, task, self.n_shot_test, self.n_query,
                self.update_step_test)
            support = self.loader_to_samples(support)
            eval_loader = DataLoader(dataset,
                                     batch_size=self.n_query,
                                     shuffle=True,
                                     num_workers=0)
            adapt_loader = DataLoader(query_adapt,
                                      batch_size=self.n_query,
                                      shuffle=True,
                                      num_workers=0)
            adapt_data = {
                's_data': support,
                's_label': support.y,
                'data_loader': adapt_loader
            }
            eval_data = {
                's_data': support,
                's_label': support.y,
                'data_loader': eval_loader
            }
            return adapt_data, eval_data

        if task not in self.preload_test_data:
            dataset = MoleculeDataset(self.data_dir + self.test_dataset +
                                      "/new/" + str(task + 1),
                                      dataset=self.test_dataset)
        else:
            dataset = self.preload_test_data[task]

        support, query, query_adapt = sample_test_datasets(
            dataset, self.test_dataset, task, self.n_shot_test, self.n_query,
            self.update_step_test)

        support_y = stack_first_labels(support)

        eval_loader = query.get_data_loader(batch_size=self.n_query,
                                            shuffle=True,
                                            num_workers=1)
        adapt_loader = query_adapt.get_data_loader(batch_size=self.n_query,
                                                   shuffle=True,
                                                   num_workers=1)

        if not self.update_s_q:
            # Adaptation re-uses the same support set for every update step.
            adapt_data = {'data_loader': [support] * self.update_step_test}
            eval_data = {'data_loader': eval_loader}
            return adapt_data, eval_data

        adapt_data = {
            's_data': G.Graph.batch(support.data_list),
            's_label': paddle.to_tensor(support_y),
            'data_loader': adapt_loader
        }
        eval_data = {
            's_data': G.Graph.batch(support.data_list),
            's_label': paddle.to_tensor(support_y),
            'data_loader': eval_loader
        }
        return adapt_data, eval_data
예제 #23
0
파일: run_du.py 프로젝트: wbj0110/models
def do_train(args):
    """Fine-tune a span-extraction model and evaluate it after each epoch.

    Builds the train/dev datasets and loaders for ``args.task_name``, loads
    the pretrained model and tokenizer, then runs a warmup + linear-decay
    AdamW training loop. Checkpoints are written every ``args.save_steps``
    steps under ``args.output_dir/model_<step>``.

    Args:
        args: Parsed command-line namespace. Fields read here include
            ``n_gpu``, ``task_name``, ``model_type``, ``model_name_or_path``,
            ``data_path``, ``doc_stride``, ``max_query_length``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``warmup_proportion``, ``learning_rate``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``. ``args.model_type`` is
            lower-cased in place.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    task_name = args.task_name.lower()
    dataset_class = TASK_CLASSES[task_name]

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    root = args.data_path
    set_seed(args)

    train_ds = dataset_class(
        tokenizer=tokenizer,
        root=root,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        max_seq_length=args.max_seq_length,
        mode='train')

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)

    # Field 2 (the unique id) is collated but dropped from training batches.
    train_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # unique_id
        Stack(dtype="int64"),  # start_pos
        Stack(dtype="int64")  # end_pos
    ): [data for i, data in enumerate(fn(samples)) if i != 2]

    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=train_batchify_fn,
        return_list=True)

    dev_ds = dataset_class(
        tokenizer=tokenizer,
        root=root,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        max_seq_length=args.max_seq_length,
        mode='dev')

    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)

    dev_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack()  # unique_id
    ): fn(samples)

    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=dev_batchify_fn,
        return_list=True)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Schedule length: an explicit max_steps wins over epochs * batches.
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    # May be fractional; only used in comparisons and divisions below.
    num_warmup_steps = args.warmup_proportion * num_training_steps

    def lr_factor(current_step):
        # Linear ramp from 0 to 1 during warmup, then linear decay to 0.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   lr_factor)

    # Names of the parameters that receive weight decay: everything except
    # bias and norm parameters. Built once as a set so the optimizer's
    # per-parameter lookup does not re-walk the whole model each time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    criterion = CrossEntropyLossForSQuAD()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, start_positions, end_positions = batch

            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, (start_positions, end_positions))

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()

            if global_step % args.save_steps == 0:
                # Only one process writes checkpoints.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    os.makedirs(output_dir, exist_ok=True)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print('Saving checkpoint to:', output_dir)

        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            evaluate(model, dev_data_loader, args, tokenizer)
예제 #24
0
def train():
    """Fine-tune an ERNIE-GEN model on the 'poetry' dataset.

    Reads its configuration from the module-level ``args``. Picks a tokenizer
    from substrings of ``args.model_name_or_path``, optionally restores
    ``args.init_checkpoint``, builds the train/dev loaders, and runs an AdamW
    loop with warmup + linear decay and global-norm gradient clipping. Every
    ``args.save_steps`` steps rank 0 evaluates on the dev set and saves the
    model and tokenizer under ``args.output_dir/model_<step>``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    # Order matters: "ernie-tiny" must be tested before the broader "ernie".
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = load_dataset(
        'poetry', splits=('train', 'dev'), lazy=False)
    # Fall back to the [MASK] id when the vocabulary has no [ATTN] token.
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    # The last row of the sentence-type embedding is used as the target type.
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_dataset = dev_dataset.map(trans_func)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]
    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All 'forward' outputs derived from the module parameters using in DataParallel
        # must participate in the calculation of losses and subsequent gradient calculations.
        # So we use StackModel here to make the model only output loss in its 'forward' function.
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    # Names of the parameters that receive weight decay: everything except
    # bias and norm parameters. A set keeps the optimizer's per-parameter
    # membership test constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    # global_step is 1-based here and is advanced at the end of each batch.
    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids,
             mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
             tgt_labels, _) = batch
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            # Positions in attn_ids that hold the attention placeholder id.
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)
            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss, args.logging_steps /
                           (time.time() - tic_train), lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and paddle.distributed.get_rank(
            ) == 0:
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                os.makedirs(output_dir, exist_ok=True)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
예제 #25
0
def do_train(args):
    """Fine-tune a token-classification (NER) model.

    Loads ``args.dataset``, tokenizes it with word-level label alignment,
    then trains with AdamW under a warmup + linear-decay schedule. Every
    ``args.save_steps`` steps, and at the final step, rank 0 evaluates on the
    test split (and on the validation split for ``peoples_daily_ner``) and
    saves the model state dict to
    ``args.output_dir/model_<step>.pdparams``.

    Args:
        args: Parsed command-line namespace. Fields read here include
            ``device``, ``dataset``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    raw_datasets = load_dataset(args.dataset)
    # Only this dataset is given a validation loader below.
    use_dev_split = args.dataset == "peoples_daily_ner"

    AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    train_ds = raw_datasets['train']

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        """Tokenize a batch and pad/truncate its tags to the token length."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            num_tokens = len(tokenized_inputs['input_ids'][i])
            label_ids = label
            # Two token slots are reserved for the wrapping positions added
            # below; truncate the tags when the text was truncated.
            if num_tokens - 2 < len(label_ids):
                label_ids = label_ids[:num_tokens - 2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (num_tokens - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # NOTE(review): every split drops its last example here; the reason is
    # not visible in this function, so the behavior is kept as-is.
    train_ds = train_ds.select(range(len(train_ds) - 1))
    train_ds = train_ds.map(tokenize_and_align_labels, batched=True)

    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),  # seq_len
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = raw_datasets['test']
    test_ds = test_ds.select(range(len(test_ds) - 1))
    test_ds = test_ds.map(tokenize_and_align_labels, batched=True)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    if use_dev_split:
        dev_ds = raw_datasets['validation']
        dev_ds = dev_ds.select(range(len(dev_ds) - 1))
        dev_ds = dev_ds.map(tokenize_and_align_labels, batched=True)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network and its loss
    model = AutoForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # An explicit max_steps wins over epochs * batches.
    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Names of the parameters that receive weight decay: everything except
    # bias and norm parameters. A set keeps the optimizer's per-parameter
    # membership test constant-time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    if use_dev_split:
                        evaluate(model, loss_fct, metric, dev_data_loader,
                                 label_num, "valid")
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num, "test")

                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            # Stop as soon as the scheduled number of steps is reached.
            if global_step >= num_training_steps:
                return
예제 #26
0
USE_GPU = False  # whether to use the GPU to run the model


# define a random dataset
class RandomDataset(Dataset):
    """Synthetic dataset that returns a fresh random sample on every access.

    Args:
        num_samples: Value reported by ``len()``; the index passed to
            ``__getitem__`` is ignored.
    """

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        """Return ``(image, label)``: a float32 vector and an int64 label."""
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        # randint's upper bound is exclusive, so CLASS_NUM (not CLASS_NUM - 1)
        # is needed for labels to cover every class 0 .. CLASS_NUM - 1.
        label = np.random.randint(0, CLASS_NUM, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples


# Enable Paddle's static mode before the loader is constructed.
paddle.enable_static()

# Sized so that one pass yields BATCH_NUM batches of BATCH_SIZE samples.
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

# Shuffled loader; drop_last discards a trailing partial batch and
# num_workers=2 requests two loader workers.
loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

# Iterate every batch of every epoch, printing the image's type and contents.
for e in range(EPOCH_NUM):
    for i, (image, label) in enumerate(loader()):
        print(type(image))
        print(image.__array__())
예제 #27
0
def do_train(args):
    """Fine-tune a sequence-level model on one task from ``TASK_CLASSES``.

    Builds the train and dev loaders (two dev loaders for ``mnli``), loads
    the pretrained model, and trains with AdamW under a warmup + linear-decay
    schedule. Every ``args.save_steps`` steps the dev set(s) are evaluated
    and one process saves the model and tokenizer.

    Args:
        args: Parsed command-line namespace. Fields read here include
            ``n_gpu``, ``task_name``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``warmup_steps``, ``warmup_proportion``,
            ``learning_rate``, ``adam_epsilon``, ``weight_decay``,
            ``logging_steps``, ``save_steps`` and ``output_dir``.
            ``args.task_name`` and ``args.model_type`` are lower-cased in
            place.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset = dataset_class.get_datasets(["train"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    # Field 2 (the length) is collated but dropped from the batches. Labels
    # are int64 when the task has a label list and float32 otherwise.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64"
              if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        # MNLI has two dev sets, each with its own loader.
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    # A task without a label list is given a single output.
    num_classes = 1 if train_dataset.get_labels() is None else len(
        train_dataset.get_labels())
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # An explicit max_steps / warmup_steps wins over the derived values.
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (int(
        math.floor(num_training_steps * args.warmup_proportion)))

    def lr_factor(current_step):
        # Linear ramp from 0 to 1 during warmup, then linear decay to 0.
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   lr_factor)

    # Names of the parameters that receive weight decay: everything except
    # bias and norm parameters. Built once as a set so the optimizer's
    # per-parameter lookup does not re-walk the whole model each time.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                logger.info("eval done total : %s s" %
                            (time.time() - tic_eval))
                # Only one process writes checkpoints.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    # The checkpoint path is used as a directory even though
                    # its name ends in ".pdparams".
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
예제 #28
0
def do_train(args):
    """Fine-tune ``BertForTokenClassification`` on the MSRA NER dataset.

    Builds the train/dev data loaders, the (optionally data-parallel) model,
    a warmup + linear-decay learning-rate schedule and an AdamW optimizer,
    then runs the training loop with periodic logging, evaluation and
    checkpointing.

    Args:
        args: Run configuration. Attributes read here: ``n_gpu``,
            ``model_name_or_path``, ``max_seq_length``, ``batch_size``,
            ``learning_rate``, ``warmup_steps``, ``max_steps``,
            ``num_train_epochs``, ``adam_epsilon``, ``weight_decay``,
            ``logging_steps``, ``save_steps`` and ``output_dir``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "dev"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    # The last label id is handed to convert_example as the non-entity id.
    no_entity_id = label_num - 1
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=no_entity_id,
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)

    # Label positions padded with this value are skipped by the loss
    # (it is also passed as ``ignore_index`` to CrossEntropyLoss below).
    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label)  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    # NOTE(review): drop_last=True is also set for the dev sampler, so a
    # trailing partial batch would not be evaluated -- confirm this is wanted.
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               drop_last=True)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Learning-rate multiplier: linear warmup from 0 to 1 over
    # ``warmup_steps``, then linear decay to 0 at ``num_training_steps``.
    num_warmup_steps = args.warmup_steps
    num_training_steps = (args.max_steps if args.max_steps > 0 else
                          len(train_data_loader) * args.num_train_epochs)

    def lr_factor(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   lr_factor)

    # Weight decay is applied to every parameter whose structured name does
    # not contain "bias" or "norm". The name set is built once here instead
    # of being rebuilt inside the callback on every invocation.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB")

    # NOTE(review): global_step starts at 0 and is incremented at the end of
    # each iteration, so the very first batch already triggers a log line and
    # an evaluate/save. Also, ``max_steps`` only shapes the LR schedule; the
    # loop below always runs for ``num_train_epochs`` full epochs.
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            input_ids, segment_ids, length, labels = batch
            logits = model(input_ids, segment_ids)
            # Flatten so every token position is scored independently.
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0:
                evaluate(model, loss_fct, metric, dev_data_loader, label_num)
                # Only one process writes the checkpoint.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            global_step += 1
예제 #29
0
def do_train(args):
    """Train a width-elastic supernet from a fine-tuned model with distillation.

    Builds the train/dev data loaders for ``args.task_name``, converts the
    pretrained model into a supernet, wraps it together with a teacher model
    in ``OFA``, reorders heads/neurons by importance, and then trains every
    width in ``args.width_mult_list`` on each batch. Periodically logs,
    evaluates the teacher and each sub-network width, and saves the model.

    Args:
        args: Run configuration. Attributes read here: ``n_gpu``,
            ``task_name``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``width_mult_list``,
            ``max_steps``, ``num_train_epochs``, ``learning_rate``,
            ``warmup_steps``, ``adam_epsilon``, ``weight_decay``,
            ``lambda_logit``, ``logging_steps``, ``save_steps`` and
            ``output_dir``. ``task_name`` and ``model_type`` are lower-cased
            in place.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = dataset_class.get_datasets(['train'])

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # The collate function stacks four fields and then drops the third one
    # (length), so each batch is (input_ids, segment_ids, labels).
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        # MNLI has two dev splits, each with its own loader.
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    # A dataset without a label list is treated as regression (one output).
    task_labels = train_ds.get_labels()
    num_labels = 1 if task_labels is None else len(task_labels)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Step1: Initialize a dictionary to save the weights from the origin BERT model.
    origin_weights = dict(model.named_parameters())

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=args.width_mult_list)
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['bert.embeddings']
    mapping_layers.extend(
        'bert.encoder.layers.{}'.format(idx)
        for idx in range(model.bert.config['num_hidden_layers']))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss() if train_ds.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    if args.task_name == "mnli":
        # The importance computation below receives both MNLI dev loaders.
        dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)

    # Step6: Calculate the importance of neurons and head,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Weight decay is applied to every parameter whose structured name does
    # not contain "bias" or "norm". The name set is built once here instead
    # of being rebuilt inside the callback on every invocation.
    decay_params = {
        p.name
        for n, p in ofa_model.model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            # Gradients from every width are accumulated before the single
            # optimizer step taken after this inner loop.
            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = apply_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids,
                                                   segment_ids,
                                                   attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if args.task_name == 'sts-b':
                    # No logit distillation term for the sts-b task.
                    logit_loss = 0.0
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            ofa_model.model.clear_gradients()

            if global_step % args.logging_steps == 0:
                # The logged loss is the one from the last width trained.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss, args.logging_steps /
                           (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                # Evaluate the teacher first, then each sub-network width.
                if args.task_name == "mnli":
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_matched,
                             width_mult=100)
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_mismatched,
                             width_mult=100)
                else:
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader,
                             width_mult=100)
                for width_mult in args.width_mult_list:
                    net_config = apply_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_mismatched, width_mult)
                    else:
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader, width_mult)
                    print("eval done total : %s s" % (time.time() - tic_eval))

                    # Only one process writes the checkpoint. It is written
                    # once per width, each time into the same directory.
                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
예제 #30
0
def main_worker(gpu, ngpus_per_node, args):
    """Run DCQ training in one worker process.

    Sets up logging, builds the DCQ model and a Momentum optimizer,
    optionally resumes from a checkpoint, builds the training data loader
    and runs the epoch loop, saving checkpoints from rank 0.

    Args:
        gpu: Not used in this function; ``args.gpu`` is overwritten with the
            distributed rank instead.
        ngpus_per_node: Not used in this function.
        args: Run configuration. Attributes read here: ``save``,
            ``distributed``, ``arch``, ``feat_dim``, ``queue_size``,
            ``dcq_momentum``, ``scale``, ``margin``, ``lr``, ``momentum``,
            ``weight_decay``, ``resume``, ``start_epoch``, ``filelist``,
            ``data``, ``dataprob``, ``iter_per_epoch``, ``batch_size``,
            ``sampling_base``, ``epochs`` and ``schedule``. ``gpu`` is set
            here, and ``start_epoch`` is overwritten when resuming.

    Raises:
        NotImplementedError: If ``args.arch`` is not an attribute of
            ``models`` or ``args.filelist`` is ``None``.
    """
    args.gpu = dist.get_rank()
    logger = get_logger('dcq',
                        log_file='{}/workerlog.{}'.format(args.save, args.gpu),
                        level='info',
                        rank=args.gpu)

    # suppress printing if not master
    if args.distributed and args.gpu != 0:

        # Accept keyword arguments as well, so that print(..., end=...,
        # file=..., flush=...) stays a silent no-op instead of raising
        # TypeError on non-master workers.
        def print_pass(*_args, **_kwargs):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        logger.info("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        dist.init_parallel_env()

    # create model
    logger.info("=> creating model '{}'".format(args.arch))
    if args.arch not in models.__dict__:
        raise NotImplementedError
    backbone = models.__dict__[args.arch]

    model = DCQ(
        backbone,
        args.feat_dim,
        args.queue_size,
        args.dcq_momentum,
        args.scale,
        args.margin,
    )

    if args.distributed:
        model = paddle.DataParallel(model)

    criterion = paddle.nn.loss.CrossEntropyLoss(reduction='mean')
    optimizer = paddle.optimizer.Momentum(learning_rate=args.lr,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay,
                                          parameters=model.parameters())

    if args.resume:
        # A checkpoint is three files sharing the ``args.resume`` prefix:
        # ``.pdparams`` (model), ``.pdopt`` (optimizer), ``.state.pickle``.
        if os.path.isfile(args.resume + '.pdparams'):
            print("=> loading checkpoint '{}'".format(args.resume))
            # NOTE(security): pickle.load can execute arbitrary code; only
            # resume from checkpoint files that come from a trusted source.
            with open(args.resume + '.state.pickle', 'rb') as fin:
                state = pickle.load(fin)
            args.start_epoch = state['epoch']

            state_dict = paddle.load(args.resume + '.pdparams')
            print(model.set_state_dict(state_dict))

            optimizer_state = paddle.load(args.resume + '.pdopt')
            optimizer.set_state_dict(optimizer_state)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, state['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    augmentation = build_aug(args)

    if args.filelist is None:
        raise NotImplementedError

    # ``data``, ``filelist`` and ``dataprob`` are ';'-separated lists that
    # must have one entry per dataset.
    roots = args.data.split(';')
    anno_files = args.filelist.split(';')
    probs = args.dataprob.split(';')
    assert len(roots) == len(anno_files)
    assert len(probs) == len(anno_files)
    datasets = [
        ImageDataset(root=root, anno_file=anno_file)
        for root, anno_file in zip(roots, anno_files)
    ]
    probs = [float(v) for v in probs]

    data_processing = Processing(transform=augmentation)
    train_dataset = Sampler(datasets,
                            probs,
                            samples_per_epoch=args.iter_per_epoch *
                            args.batch_size,
                            processing=data_processing,
                            k=2,
                            sampling_base=args.sampling_base)

    if args.sampling_base == 'image':
        train_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                          args.batch_size,
                                                          shuffle=True,
                                                          drop_last=True)
        train_loader = DataLoader(train_dataset,
                                  num_workers=1,
                                  batch_sampler=train_sampler)
    else:
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=1,
                                  drop_last=True)
    print(
        f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}: DataLoader is ready.'
    )

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, logger)
        # Only rank 0 saves, and only for epochs after ``args.schedule[0]``.
        if args.gpu == 0 and epoch > args.schedule[0]:
            # we don't need to save the queue
            state_dict = {
                key: value
                for key, value in model.state_dict().items()
                if 'queue' not in key
            }

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': state_dict,
                    'optimizer': optimizer.state_dict(),
                },
                filename='{}/face_checkpoint_{:04d}'.format(args.save, epoch))