Example #1
def evaluate(model, ds, all_examples, all_features, tokenizer, args):
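    # Run MRC evaluation: collect start/end logits for every dev feature, decode answers
    # with mrc_metrics, and return F1/EM measured against the dev file.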
    dev_file = json.loads(open(args.dev_file, encoding='utf8').read())
    with P.no_grad():
        log.debug('start eval')
        model.eval()
        all_res = []
        for step, (uids, token_ids, token_type_ids, _, __) in enumerate(
                P.io.DataLoader(ds,
                                places=P.CUDAPlace(env.dev_id),
                                batch_size=None)):
            _, start_logits, end_logits = model(token_ids, token_type_ids)
            res = [
                mrc_metrics.RawResult(unique_id=u,
                                      start_logits=s,
                                      end_logits=e)
                for u, s, e in zip(uids.numpy(), start_logits.numpy(),
                                   end_logits.numpy())
            ]
            all_res += res
        open('all_res', 'wb').write(pickle.dumps(all_res))
        all_pred, all_nbests = mrc_metrics.make_results(
            tokenizer,
            all_examples,
            all_features,
            all_res,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            do_lower_case=tokenizer.lower)
        f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred)
        model.train()
        log.debug('done eval')
        return f1, em
Example #2
def evaluate(model, ds, all_examples, all_features, tokenizer, args):
    dev_file = json.loads(open(args.dev_file).read())
    with D.base._switch_tracer_mode_guard_(is_train=False):
        log.debug('start eval')
        model.eval()
        all_res = []
        for step, (uids, token_ids, token_type_ids, _,
                   __) in enumerate(ds.start(place)):
            _, start_logits, end_logits = model(token_ids, token_type_ids)
            res = [
                mrc_metrics.RawResult(unique_id=u,
                                      start_logits=s,
                                      end_logits=e)
                for u, s, e in zip(uids.numpy(), start_logits.numpy(),
                                   end_logits.numpy())
            ]
            all_res += res
        open('all_res', 'wb').write(pickle.dumps(all_res))
        all_pred, all_nbests = mrc_metrics.make_results(
            tokenizer,
            all_examples,
            all_features,
            all_res,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            do_lower_case=tokenizer.lower)
        f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred)
        model.train()
        log.debug('done eval')
        return f1, em
Example #3
def build_bb(from_file, to_file):
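    # Group consecutive non-blank lines into one example: a blank line flushes the
    # accumulated, transposed slots as a serialized protobuf record written via write_gz.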
    slots = []
    for i, line in enumerate(from_file):
        line = line.strip()
        if args.verbose and i % 10000 == 0:
            log.debug(i)
        if len(line) == 0:
            if len(slots) != 0:
                transposed_slots = list(zip(*slots))
                ex = build_example(transposed_slots)
                write_gz(ex.SerializeToString(), to_file)
                slots = []
            continue
        parsed_line = parse_txt(line)
        slots.append(parsed_line)

    if len(slots) != 0:
        transposed_slots = list(zip(*slots))
        ex = build_example(transposed_slots)
        write_gz(ex.SerializeToString(), to_file)
        slots = []
Example #4
def train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokenizer, args):
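    # Fine-tune an MRC model under dygraph data parallelism: scale the loss, all-reduce
    # gradients across ranks, and log/evaluate periodically on rank 0.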
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)

    max_steps = len(train_features) * args.epoch // args.bsz
    opt = AdamW(learning_rate=args.lr, parameter_list=model.parameters(), weight_decay=args.wd)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental

    train_dataset = train_dataset \
            .repeat() \
            .shard(D.parallel.Env().nranks, D.parallel.Env().dev_id) \
            .shuffle(1000) \
            .padded_batch(args.bsz) 

    log.debug('init training with args: %s' % repr(args))
    for step, (_, token_ids, token_type_ids, start_pos, end_pos) in enumerate(train_dataset.start(place)):
        loss, _, __ = model(token_ids, token_type_ids, start_pos=start_pos, end_pos=end_pos)
        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss, grad_clip=g_clip)
        model.clear_gradients()
        if D.parallel.Env().dev_id == 0 and step % 10 == 0:
            log.debug('[step %d] train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
        if D.parallel.Env().dev_id == 0 and step % 100 == 0:
            f1, em = evaluate(model, dev_dataset, dev_examples, dev_features, tokenizer, args)
            log.debug('[step %d] eval result: f1 %.5f em %.5f' % (step, f1, em))
        if step > max_steps:
            break
Example #5
def tokenizer(sen):
    log.debug(sen)
    return sen.split(b' ')
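# Usage sketch (illustrative only, not part of the source project): the tokenizer above
# splits byte strings, so text must be utf-8 encoded first; it also assumes the
# module-level `log` used above is configured.
sentence = u'ERNIE is a pretrained language model'.encode('utf8')
tokens = tokenizer(sentence)  # -> [b'ERNIE', b'is', b'a', ...]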
Example #6
File: optimization.py  Project: Yelrose/PGL
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_lamb=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 layer_decay_rate=0.0,
                 n_layers=12):
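    # Build the optimizer for a static-graph program: warmup LR schedule (noam or linear),
    # Adam or LAMB, optional fp16 loss scaling, plus manual weight decay and layer-wise
    # LR decay applied to the parameter updates.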
    def exclude_from_weight_decay(param):
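        # Exclude LayerNorm parameters and biases from weight decay.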
        name = param.name.rstrip('.master')
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                         warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
        optimizer._learning_rate_map[fluid.default_main_program(
        )] = scheduled_lr

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(
                loss_scaling, master_param_grads, incr_every_n_steps,
                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if not use_lamb and weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)

    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)
        if layer_decay_rate > 0:
            for param, grad in param_grads:
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("layer_decay"):
                    param_decay = layer_decay(param, param_list[param.name],
                                              scheduled_lr, layer_decay_rate,
                                              n_layers)
                    if param_decay:
                        fluid.layers.assign(output=param, input=param_decay)

        if not use_lamb and weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
Example #7
                    loss.backward()
                    scaler.minimize(opt, loss)
                    model.clear_gradients()
                    lr_scheduler.step()

                    if step % 10 == 0:
                        _lr = lr_scheduler.get_lr()
                        if args.use_amp:
                            _l = (loss / scaler._scale).numpy()
                            msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                                step, _l, _lr, scaler._scale.numpy())
                        else:
                            _l = loss.numpy()
                            msg = '[step-%d] train loss %.5f lr %.3e' % (
                                step, _l, _lr)
                        log.debug(msg)
                        log_writer.add_scalar('loss', _l, step=step)
                        log_writer.add_scalar('lr', _lr, step=step)

                    if step % 100 == 0:
                        acc = []
                        with P.no_grad():
                            model.eval()
                            for step, d in enumerate(
                                    P.io.DataLoader(dev_ds,
                                                    places=P.CUDAPlace(0),
                                                    batch_size=None)):
                                ids, sids, label = d
                                loss, logits = model(ids, sids, labels=label)
                                a = (logits.argmax(-1) == label)
                                acc.append(a.numpy())
Example #8
            args.from_pretrained, num_labels=3, name='')

        opt = AdamW(learning_rate=LinearDecay(
            args.lr, int(args.warmup_proportion * args.max_steps),
            args.max_steps),
                    parameter_list=model.parameters(),
                    weight_decay=args.wd)
        g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0)  #experimental
        for epoch in range(args.epoch):
            for step, d in enumerate(
                    tqdm(train_ds.start(place), desc='training')):
                ids, sids, label = d
                loss, _ = model(ids, sids, labels=label)
                loss.backward()
                if step % 10 == 0:
                    log.debug('train loss %.5f lr %.3e' %
                              (loss.numpy(), opt.current_step_lr()))
                opt.minimize(loss, grad_clip=g_clip)
                model.clear_gradients()
                if step % 100 == 0:
                    acc = []
                    with FD.base._switch_tracer_mode_guard_(is_train=False):
                        model.eval()
                        for step, d in enumerate(
                                tqdm(dev_ds.start(),
                                     desc='evaluating %d' % epoch)):
                            ids, sids, label = d
                            loss, logits = model(ids, sids, labels=label)
                            #print('\n'.join(map(str, logits.numpy().tolist())))
                            a = L.argmax(logits, -1) == label
                            acc.append(a.numpy())
                        model.train()
Example #9
def greedy_search_infilling(model,
                            q_ids,
                            q_sids,
                            sos_id,
                            eos_id,
                            attn_id,
                            max_encode_len=640,
                            max_decode_len=100):
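    # Greedy decoding with ERNIE-GEN style infilling: each step feeds the last generated
    # token paired with an [ATTN] placeholder, extends the key/value cache by one position,
    # and stops once every sequence has emitted eos_id.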
    model.eval()
    #log.debug(q_ids.numpy().tolist())
    _, logits, info = model(q_ids, q_sids)
    gen_ids = L.argmax(logits, -1)
    d_batch, d_seqlen = q_ids.shape
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)
    log.debug(seqlen.numpy())
    log.debug(d_seqlen)
    has_stopped = np.zeros([d_batch], dtype=bool)
    gen_seq_len = np.zeros([d_batch], dtype=np.int64)
    output_ids = []

    past_cache = info['caches']

    cls_ids = L.ones([d_batch], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch], dtype='int64') * attn_id
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        log.debug('decode step %d' % step)
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * 3,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)
        gen_ids = L.argmax(logits, -1)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            L.concat([pk, k[:, :1, :]], 1)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            L.concat([pv, v[:, :1, :]], 1)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        gen_ids = gen_ids[:, 1]
        ids = L.stack([gen_ids, attn_ids], 1)

        gen_ids = gen_ids.numpy()
        has_stopped |= (gen_ids == eos_id).astype(bool)
        gen_seq_len += (1 - has_stopped.astype(np.int64))
        output_ids.append(gen_ids.tolist())
        if has_stopped.all():
            #log.debug('exit because all done')
            break
        #if step == 1: break
    output_ids = np.array(output_ids).transpose([1, 0])
    return output_ids
Example #10
def create_model(args, phase, micro_bsz, dp_sharding_rank, dp_worldsize, topo):
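    # Build the static-graph pretraining network: declare feed inputs, wire up the data
    # loader, run ErnieModel under pipeline device guards, and return the loss and fetch
    # variables in graph_vars.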
    if args.use_sop:
        from reader.pretraining_ds_ernie_full_sent import make_pretrain_dataset
    else:
        from reader.pretraining_ds_mlm import make_pretrain_dataset

    # mask_label, mask_pos for mlm, labels for sop
    if args.use_sop:
        input_fields = {
            'names':
            ['src_ids', 'sent_ids', 'mask_label', 'mask_pos', 'labels'],
            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, 1], [-1, 1], [-1, 1]],
            'dtypes': ['int64', 'int64', 'int64', 'int64', 'int64'],
            'lod_levels': [0, 0, 0, 0, 0],
        }
    else:
        input_fields = {
            'names': ['src_ids', 'sent_ids', 'mask_label', 'mask_pos'],
            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, 1], [-1, 1]],
            'dtypes': ['int64', 'int64', 'int64', 'int64'],
            'lod_levels': [0, 0, 0, 0],
        }

    with fluid.device_guard("gpu:0"):
        inputs = [
            fluid.data(name=input_fields['names'][i],
                       shape=input_fields['shapes'][i],
                       dtype=input_fields['dtypes'][i],
                       lod_level=input_fields['lod_levels'][i])
            for i in range(len(input_fields['names']))
        ]
    if args.use_sop:
        (src_ids, sent_ids, mask_label, mask_pos, labels) = inputs
    else:
        (src_ids, sent_ids, mask_label, mask_pos) = inputs
    train_file_list = glob.glob(args.data_dir + "/*")
    vocab = {}
    with open(args.vocab_file) as r:
        for line in r:
            lines = line.strip().split('\t')
            vocab[lines[0]] = int(lines[1])

    log.debug("========= worker: {} of {} ==========".format(
        dp_sharding_rank, dp_worldsize))

    data_reader = make_pretrain_dataset('pt', train_file_list, True, vocab,
                                        micro_bsz, len(vocab),
                                        args.max_seq_len, dp_sharding_rank,
                                        dp_worldsize)
    with fluid.device_guard("gpu:0"):
        data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                         capacity=70,
                                                         iterable=False)
    places = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    def data_gen():
        yield from data_reader

    data_loader.set_batch_generator(data_gen, places)

    ernie_config = ErnieConfig(args.ernie_config_file)._config_dict
    ernie_config["preln"] = args.preln

    weight_sharing = (topo.mp.size == 1 and topo.pp.size == 1
                      )  # pp mp should not do weight sharing
    with fluid.device_guard("gpu:0"):
        ernie = ErnieModel(src_ids,
                           sent_ids,
                           ernie_config,
                           weight_sharing=weight_sharing,
                           topo=topo)
    checkpoints = ernie._checkpoints
    checkpoints.pop(-1)

    with fluid.device_guard(f'gpu:{args.num_pp-1}'):
        mask_lm_loss, mean_mask_lm_loss = ernie.get_lm_output(
            mask_label, mask_pos)
        total_loss = mean_mask_lm_loss

        if args.use_sop:
            sop_acc, mean_sop_loss = ernie.get_next_sentence_output(labels)
            total_loss += mean_sop_loss

        if topo.pp.size > 1:
            mask_lm_loss.persistable = True
            mean_mask_lm_loss.persistable = True
            # checkpoints.extend([mask_lm_loss.name, mean_mask_lm_loss.name])
            if args.use_sop:
                mean_sop_loss.persistable = True
                sop_acc.persistable = True
                # checkpoints.extend([mean_sop_loss.name, sop_acc.name])
            total_loss.persistable = True
            # checkpoints.append(total_loss.name)

    if args.use_sop:
        graph_vars = {
            'data_loader': data_loader,
            'mask_lm_loss': mask_lm_loss,
            'mean_mask_lm_loss': mean_mask_lm_loss,
            'sop_loss': mean_sop_loss,
            'sop_acc': sop_acc,
            'total_loss': total_loss,
            'checkpoints': checkpoints
        }
    else:
        graph_vars = {
            'data_loader': data_loader,
            'mask_lm_loss': mask_lm_loss,
            'mean_mask_lm_loss': mean_mask_lm_loss,
            'total_loss': total_loss,
            'checkpoints': checkpoints,
        }
    return graph_vars
Example #11
def train(args):
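    # Distributed pretraining entry point: configure fleet (recompute, sharding, pipeline,
    # AMP, LAMB), build the train/startup programs, then run the executor loop with
    # periodic logging and checkpointing.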
    log.info("pretraining start")
    profile = False

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulation, global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using globa_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay':
        0.01,
        'exclude_from_weight_decay':
        ['layer_norm_bias', 'layer_norm_scale', '.b_0']
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376

            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")

            optimizer.minimize(graph_vars['total_loss'])

            final_strategy = fleet._final_strategy()
            applied_meta_list = fleet._get_applied_meta_list()
            log.info("final strategy: {}".format(final_strategy))
            log.info("applied_meta_list: {}".format(applied_meta_list))

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(
            program_desc_dir + "/main_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))

    with open(
            program_desc_dir + "/startup_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)

    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '####### using init_checkpoint, not init_pretraining_params ####')
        log.info(
            '## meaning hyper param e.g. lr will inherit from checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'], graph_vars['mean_mask_lm_loss'],
                    scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            ret = exe.run(train_program, fetch_list=fetch_list
                          )  # run one mini-batch(=acc_steps micro-batch)
            #use_program_cache=True)

            steps += 1

            if is_last:
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))
Example #12
    with FD.guard():
        model = ErnieModelForTokenClassification.from_pretrained(
            args.from_pretrained, num_labels=7, name='')

        opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps,
                                              args.max_steps),
                    parameter_list=model.parameters(),
                    weight_decay=0.01)
        #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
        for epoch in range(args.epoch):
            for step, (ids, sids, aligned_label, label,
                       orig_pos) in enumerate(tqdm(train_ds.start())):
                loss, _ = model(ids, sids, labels=aligned_label)
                loss.backward()
                if step % 10 == 0:
                    log.debug('train loss %.5f' % loss.numpy())
                opt.minimize(loss)
                model.clear_gradients()
                if step % 100 == 0:
                    all_pred, all_label = [], []
                    with FD.base._switch_tracer_mode_guard_(is_train=False):
                        model.eval()
                        for step, (ids, sids, aligned_label,
                                   label, orig_pos) in enumerate(
                                       tqdm(dev_ds.start())):
                            loss, logits = model(ids,
                                                 sids,
                                                 labels=aligned_label)
                            #print('\n'.join(map(str, logits.numpy().tolist())))

                            for pos, lo, la in zip(orig_pos.numpy(),
Example #13
def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
          tokenizer, args):
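    # MRC fine-tuning on the paddle 2.x dygraph API: AdamW with warmup plus linear decay,
    # optional AMP via GradScaler, and periodic evaluation/checkpointing on rank 0.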
    model = P.DataParallel(model)

    max_steps = len(train_features) * args.epoch // args.bsz

    g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr,
        get_warmup_and_linear_decay(max_steps,
                                    int(args.warmup_proportion * max_steps)))

    opt = P.optimizer.AdamW(lr_scheduler,
                            parameters=model.parameters(),
                            weight_decay=args.wd,
                            grad_clip=g_clip)

    train_dataset = train_dataset \
            .cache_shuffle_shard(env.nranks, env.dev_id, drop_last=True) \
            .padded_batch(args.bsz)

    log.debug('init training with args: %s' % repr(args))
    scaler = P.amp.GradScaler(enable=args.use_amp)
    create_if_not_exists(args.save_dir)

    with P.amp.auto_cast(enable=args.use_amp):
        for step, (_, token_ids, token_type_ids, start_pos,
                   end_pos) in enumerate(
                       P.io.DataLoader(train_dataset,
                                       places=P.CUDAPlace(env.dev_id),
                                       batch_size=None)):
            loss, _, __ = model(token_ids,
                                token_type_ids,
                                start_pos=start_pos,
                                end_pos=end_pos)
            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if env.dev_id == 0 and step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)

            if env.dev_id == 0 and step % 100 == 0:
                f1, em = evaluate(model, dev_dataset, dev_examples,
                                  dev_features, tokenizer, args)
                log.debug('[step %d] eval result: f1 %.5f em %.5f' %
                          (step, f1, em))
            if env.dev_id == 0 and args.save_dir is not None:
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
            if step > max_steps:
                break
Example #14
def seq2seq(model, tokenizer, args):
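    # ERNIE-GEN style seq2seq fine-tuning: build source/target/[ATTN] inputs with
    # block-wise attention masks, run three partial forward passes that share a key/value
    # cache, and train with optional AMP.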
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
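        # Build an attention mask of shape [batch, query_len, key_len]: 'bidi' attends to
        # all non-pad tokens, 'causal' / 'causal_without_diag' apply lower-triangular
        # masks, 'diag' keeps only the diagonal, and 'empty' blocks everything.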
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            # mask_type == 'empty': an all-zero mask (attend to nothing)
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
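        # Corrupt a random args.noise_prob fraction of positions, replacing them with
        # random ids or the [NOISE] token.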
        if args.use_random_noice:
            noice_ids = np.random.randint(
                1, len(tokenizer.vocab), size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
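        # Turn a raw (src, tgt) pair into model inputs: truncate, add ERNIE special
        # tokens, give targets position ids continuing after the source, and emit one
        # [ATTN] placeholder per target token.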
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous position ids, offset past src
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  #corrupted
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***  src,  tgt, attn
        src  00,   01,   11
        tgt  10,   11,   12
        attn 20,   21,   22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''

        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)

        '''

        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn(
            'src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
        propeller.data.TextColumn(
            'tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.bsz) \
                                   .map(after_padding)


    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.eval_bsz) \
                                   .map(after_padding) \
                                   .shard(env.nranks, env.dev_id)

    vocab_size, _ = model.word_emb.weight.shape
    model = P.DataParallel(model)
    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
    param_name_to_exclue_from_weight_decay = re.compile(
        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr,
        get_warmup_and_linear_decay(
            args.max_steps, int(args.warmup_proportion * args.max_steps)))

    opt = P.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.wd,
        apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
        grad_clip=g_clip)

    scaler = P.amp.GradScaler(enable=args.use_amp)
    attn_id = tokenizer.vocab[args.attn_token]
    create_if_not_exists(args.save_dir)
    if args.predict_output_dir:
        create_if_not_exists(args.predict_output_dir)

    with P.amp.auto_cast(enable=args.use_amp):
        for step, data in enumerate(
                P.io.DataLoader(
                    train_ds, places=P.CUDAPlace(env.dev_id),
                    batch_size=None)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels) = data

            _, __, info = model(
                src_ids,
                sent_ids=src_sids,
                pos_ids=src_pids,
                attn_bias=mask_src_2_src,
                encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = model(
                tgt_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_tgt_2_srctgt,
                past_cache=(cached_k, cached_v),
                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                P.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                P.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            tgt_labels = F.one_hot(tgt_labels, vocab_size)
            if args.label_smooth > 0.:
                tgt_labels = F.label_smooth(
                    tgt_labels, epsilon=args.label_smooth)
            loss, _, __ = model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=P.nonzero(attn_ids == attn_id))

            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)

            if args.save_dir is not None and step % 1000 == 0 and env.dev_id == 0:
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')

            if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
                assert  args.predict_output_dir.exists(), \
                 'predict_output_dir not found: %s' % args.predict_output_dir
                log.debug('doing predict on gpu %d...' % env.dev_id)
                evaluate(model, dev_ds, step, args)
            if step > args.max_steps:
                break
        evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
Example #15
                                       b"2": 2,
                                   }),
    ])

    def map_fn(seg_a, seg_b, label):
        seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen)
        sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b)
        return sentence, segments, label


    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.bsz, (0, 0, 0))
    train_ds = train_ds.shard(propeller.train.distribution.status.num_replica,
                              propeller.train.distribution.status.replica_id)
    log.debug('shard %d/%d' % (propeller.train.distribution.status.num_replica,
                               propeller.train.distribution.status.replica_id))
    train_ds = train_ds.shuffle(10000)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.bsz, (0, 0, 0))

    shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1])
    types = ('int64', 'int64', 'int64')

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    place = F.CUDAPlace(FD.parallel.Env().dev_id)
Example #16
    place = F.CUDAPlace(D.parallel.Env().dev_id)
    with D.guard(place):
        model = ErnieModelForPretraining.from_pretrained(args.from_pretrained)
        opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps,
                                              args.max_steps),
                    parameter_list=model.parameters(),
                    weight_decay=0.01)

        ctx = D.parallel.prepare_context()
        model = D.parallel.DataParallel(model, ctx)

        for step, samples in enumerate(tqdm(train_ds.start(place))):
            (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
            loss, mlmloss, nsploss = model(src_ids,
                                           sent_ids,
                                           labels=mlm_label,
                                           mlm_pos=mask_pos,
                                           nsp_labels=nsp_label)
            scaled_loss = model.scale_loss(loss)
            scaled_loss.backward()
            model.apply_collective_grads()
            opt.minimize(scaled_loss)
            model.clear_gradients()
            if step % 10 == 0:
                log.debug('train loss %.5f scaled loss %.5f' %
                          (loss.numpy(), scaled_loss.numpy()))
            if step % 10000 == 0 and D.parallel.Env(
            ).dev_id == 0 and args.save_dir is not None:
                F.save_dygraph(model.state_dict(), args.save_dir)
Example #17
        ])

        def before(seg_a, label):
            sentence, segments = utils.data.build_1_pair(
                seg_a,
                max_seqlen=args.max_seqlen,
                cls_id=cls_id,
                sep_id=sep_id)
            return sentence, segments, label

        def after(sentence, segments, label):
            sentence, segments, label = utils.data.expand_dims(
                sentence, segments, label)
            return sentence, segments, label

        log.debug(os.path.join(args.data_dir, 'train'))
        train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
                                       .map(before) \
                                       .padded_batch(hparams.batch_size, (0, 0, 0)) \
                                       .map(after)

        dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                       .map(before) \
                                       .padded_batch(hparams.batch_size, (0, 0, 0)) \
                                       .map(after)

        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
        types = ('int64', 'int64', 'int64')

        train_ds.data_shapes = shapes
        train_ds.data_types = types
Example #18
            sd, _ = FD.load_dygraph(args.init_checkpoint)
            model.set_dict(sd)

        g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
        if args.use_lr_decay:
            opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)
        else:
            opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)

        for epoch in range(args.epoch):
            for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
                ids, sids, label = d
                loss, _ = model(ids, sids, labels=label)
                loss.backward()
                if step % 10 == 0:
                    log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr()))
                opt.minimize(loss)
                model.clear_gradients()
            with FD.base._switch_tracer_mode_guard_(is_train=False):
                model.eval()
                FP = 0
                TP = 0
                FN = 0
                TN = 0
                for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)):
                    ids, sids, label = d
                    loss, logits = model(ids, sids, labels=label)
                    #print('\n'.join(map(str, logits.numpy().tolist())))
                    a = L.argmax(logits, -1).numpy()
                    label = label.numpy()
                    length = a.shape[0]
Example #19
File: pretrain.py  Project: zs960114/ERNIE
                                batch_size=0)):
            (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
            loss, mlmloss, nsploss = model(src_ids,
                                           sent_ids,
                                           labels=mlm_label,
                                           mlm_pos=mask_pos,
                                           nsp_labels=nsp_label)
            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)
            if step % 1000 == 0 and env.dev_id == 0:
                log.debug('saving...')
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
            if step > args.max_steps:
                break
    log.info('done')
Example #20
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            # mask_type == 'empty': an all-zero mask (attend to nothing)
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(1,
                                          len(tokenizer.vocab),
                                          size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous position ids, offset past src
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  #corrupted
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***  src,  tgt, attn
        src  00,   01,   11
        tgt  10,   11,   12
        attn 20,   21,   22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''

        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)

        '''

        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn('src',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
        propeller.data.TextColumn('tgt',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \
                                   .map(map_fn)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.eval_bsz) \
                                   .map(after_padding)

    log.debug('shard %d of %d' %
              (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(
            args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(
        args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps),
                parameter_list=model.parameters(),
                weight_decay=args.wd,
                grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]
    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
         attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
         tgt_labels) = data

        _, __, info = model(src_ids,
                            sent_ids=src_sids,
                            pos_ids=src_pids,
                            attn_bias=mask_src_2_src,
                            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(tgt_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_tgt_2_srctgt,
                            past_cache=(cached_k, cached_v),
                            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                        epsilon=args.label_smooth)
        loss, _, __ = model(attn_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_attn_2_srctgtattn,
                            past_cache=(past_cache_k, past_cache_v),
                            tgt_labels=tgt_labels,
                            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()
        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env(
        ).dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
            assert os.path.exists(
                args.predict_output_dir
            ), 'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
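As a quick check on the mask layout documented in after_padding (the Fig. 3 layout of ERNIE-GEN), here is a NumPy-only sketch with hypothetical shapes (1 sample, 2 source tokens, 3 target tokens) that reproduces the t1..t3 rows of the table; it mirrors gen_mask but is not the original helper.

import numpy as np

def toy_mask(ids, kind, query_len):
    # same semantics as gen_mask above, for non-padded toy inputs
    m = np.tile((ids != 0).astype(np.float32)[:, None, :], [1, query_len, 1])
    if kind == 'causal':
        return np.tril(m)
    if kind == 'causal_without_diag':
        return np.tril(m, -1)
    if kind == 'diag':
        return np.stack([np.diag(np.diag(s)) for s in m], 0)
    if kind == 'empty':
        return np.zeros_like(m)
    return m  # 'bidi'

src, tgt = np.array([[1, 2]]), np.array([[3, 4, 5]])
mask_tgt_2_srctgt = np.concatenate(
    [toy_mask(src, 'bidi', 3), toy_mask(tgt, 'causal', 3)], 2)
print(mask_tgt_2_srctgt[0])
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]]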
Example #21
0
        g_clip = F.clip.GradientClipByGlobalNorm(1.0)
        opt = AdamW(learning_rate=LinearDecay(
            args.lr, int(args.warmup_proportion * args.max_steps),
            args.max_steps),
                    parameter_list=model.parameters(),
                    weight_decay=args.wd,
                    grad_clip=g_clip)
        #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
        for epoch in range(args.epoch):
            for step, (ids, sids, aligned_label, label,
                       orig_pos) in enumerate(tqdm(train_ds.start(place))):
                loss, logits = model(
                    ids,
                    sids,
                    labels=aligned_label,
                    loss_weights=L.cast(
                        ids > tokenizer.mask_id,
                        'float32'))  # [MASK] is the largest special token
                loss.backward()
                if step % 10 == 0:
                    log.debug('train loss %.5f, lr %.3e' %
                              (loss.numpy(), opt.current_step_lr()))
                opt.minimize(loss)
                model.clear_gradients()
                if step % 100 == 0:
                    f1 = evaluate(model, dev_ds)
                    log.debug('eval f1: %.5f' % f1)

        f1 = evaluate(model, dev_ds)
        log.debug('final eval f1: %.5f' % f1)
        if args.save_dir is not None:
            F.save_dygraph(model.state_dict(), args.save_dir)
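The loss_weights trick above relies on ERNIE placing the special tokens at the low end of the vocabulary, so `ids > tokenizer.mask_id` selects ordinary word pieces and zeroes out the loss on special tokens. A NumPy sketch with hypothetical token ids (not ERNIE's real vocabulary):

import numpy as np

mask_id = 3                               # hypothetical id of [MASK]
ids = np.array([[1, 20, 37, 2, 0, 0]])    # toy batch: [CLS] tok tok [SEP] [PAD] [PAD]
loss_weights = (ids > mask_id).astype('float32')
print(loss_weights)                       # [[0. 1. 1. 0. 0. 0.]] -> loss only on real tokens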
Example #22
0
                step += 1
                scaler.minimize(opt, loss)
                model.clear_gradients()
                lr_scheduler and lr_scheduler.step()

                if step % 10 == 0:
                    _lr = lr_scheduler.get_lr()
                    if args.use_amp:
                        _l = (loss / scaler._scale).numpy()
                        msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                            step, _l, _lr, scaler._scale.numpy())
                    else:
                        _l = loss.numpy()
                        msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l,
                                                                     _lr)
                    log.debug(msg)
                    log_writer.add_scalar('loss', _l, step=step)
                    log_writer.add_scalar('lr', _lr, step=step)
                if step % 100 == 0:
                    acc = []
                    with P.no_grad():
                        model.eval()
                        for ids, sids, label in P.io.DataLoader(
                                dev_ds, places=P.CUDAPlace(0),
                                batch_size=None):
                            loss, logits = model(ids, sids, labels=label)
                            #print('\n'.join(map(str, logits.numpy().tolist())))
                            a = (logits.argmax(-1) == label)
                            acc.append(a.numpy())
                        model.train()
                    acc = np.concatenate(acc).mean()
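For reference, the accuracy bookkeeping at the end of this snippet reduces to: collect the per-batch boolean hits from (logits.argmax(-1) == label), then concatenate and average once over the whole dev set. A tiny NumPy illustration with made-up batch results:

import numpy as np

batches = [np.array([True, False, True]), np.array([True, True])]
acc = np.concatenate(batches).mean()
print(acc)   # 0.8 -> 4 correct predictions out of 5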
Example #23
0
                loss.backward()
                scaler.minimize(opt, loss)
                model.clear_gradients()
                lr_scheduler.step()

                if step % 10 == 0:
                    _lr = lr_scheduler.get_lr()
                    if args.use_amp:
                        _l = (loss / scaler._scale).numpy()
                        msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                            step, _l, _lr, scaler._scale.numpy())
                    else:
                        _l = loss.numpy()
                        msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l,
                                                                     _lr)
                    log.debug(msg)
                    log_writer.add_scalar('loss', _l, step=step)
                    log_writer.add_scalar('lr', _lr, step=step)

                if step % 100 == 0:
                    f1 = evaluate(model, dev_ds)
                    log.debug('eval f1: %.5f' % f1)
                    log_writer.add_scalar('eval/f1', f1, step=step)
                    if args.save_dir is not None:
                        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')

    f1 = evaluate(model, dev_ds)
    log.debug('final eval f1: %.5f' % f1)
    log_writer.add_scalar('eval/f1', f1, step=step)
    if args.save_dir is not None:
        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
Example #24
0
            g_clip = F.clip.GradientClipByGlobalNorm(1.0)  # experimental
            opt = AdamW(learning_rate=LinearDecay(
                args.lr, int(args.warmup_proportion * args.max_steps),
                args.max_steps),
                        parameter_list=model.parameters(),
                        weight_decay=args.wd,
                        grad_clip=g_clip)

            for epoch in range(args.epoch):
                for step, d in enumerate(
                        tqdm(train_ds.start(place), desc='training')):
                    ids, sids, label = d
                    loss, _ = model(ids, sids, labels=label)
                    loss.backward()
                    if step % 10 == 0:
                        log.debug('train loss %.5f lr %.3e' %
                                  (loss.numpy(), opt.current_step_lr()))
                    opt.minimize(loss)
                    model.clear_gradients()
                    if step % 100 == 0:
                        acc = []
                        with FD.base._switch_tracer_mode_guard_(
                                is_train=False):
                            model.eval()
                            for step, d in enumerate(
                                    tqdm(dev_ds.start(place),
                                         desc='evaluating %d' % epoch)):
                                ids, sids, label = d
                                loss, logits = model(ids, sids, labels=label)
                                #print('\n'.join(map(str, logits.numpy().tolist())))
                                a = L.argmax(logits, -1) == label
                                acc.append(a.numpy())
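LinearDecay(args.lr, warmup_steps, max_steps) in these snippets is assumed to implement the usual linear warmup followed by a linear decay to zero; a plain-Python sketch of that schedule (an illustration, not the library's actual code):

def linear_warmup_decay(base_lr, warmup_steps, max_steps, step):
    # linear warmup from 0 to base_lr, then linear decay back to 0
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)
    return base_lr * max(0.0, (max_steps - step) / max(1, max_steps - warmup_steps))

print([round(linear_warmup_decay(3e-5, 100, 1000, s), 8) for s in (0, 50, 100, 550, 1000)])
# [0.0, 1.5e-05, 3e-05, 1.5e-05, 0.0]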
Example #25
0
        if end_position is None:
            end_position = 0
        return np.array(unique_id), np.array(token_ids), np.array(text_type_ids), np.array(start_position), np.array(end_position)

    train_dataset = propeller.data.Dataset.from_list(train_features).map(map_fn)

    dev_dataset = propeller.data.Dataset.from_list(dev_features).map(map_fn).padded_batch(args.bsz)
    shapes = ([-1], [-1, args.max_seqlen], [-1, args.max_seqlen], [-1], [-1])
    types = ('int64', 'int64', 'int64', 'int64', 'int64')

    train_dataset.name = 'train'
    dev_dataset.name = 'dev'

    train_dataset.data_shapes = shapes
    train_dataset.data_types = types
    dev_dataset.data_shapes = shapes
    dev_dataset.data_types = types

    place = F.CUDAPlace(D.parallel.Env().dev_id)
    D.guard(place).__enter__()
    model = ErnieModelForQuestionAnswering.from_pretrained(args.from_pretrained, name='')

    train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokenizer, args)

    if D.parallel.Env().dev_id == 0:
        f1, em = evaluate(model, dev_dataset, dev_examples, dev_features, tokenizer, args)
        log.debug('final eval result: f1 %.5f em %.5f' % (f1, em))
    if D.parallel.Env().dev_id == 0 and args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
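Example #25 relies on propeller's padded_batch to align the variable-length MRC features before batching. Conceptually it pads every field to the longest sequence in the batch; the helper below is a NumPy sketch of that idea (the function name and zero padding value are assumptions, not propeller's actual implementation).

import numpy as np

def pad_batch(seqs, pad=0):
    # pad every sequence in the batch to the length of the longest one
    maxlen = max(len(s) for s in seqs)
    return np.stack(
        [np.pad(s, (0, maxlen - len(s)), constant_values=pad) for s in seqs])

print(pad_batch([np.array([1, 2, 3]), np.array([4, 5])]))
# [[1 2 3]
#  [4 5 0]]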