Example #1
def __init__(self, args):
    self.args = args
    # Build separate inference predictors for the extraction and
    # classification models.
    self.ext_predictor, self.ext_input_handles, self.ext_output_handle = self.create_predictor(
        args.ext_model_path)
    print(f"ext_model_path: {args.ext_model_path}, {self.ext_predictor}")
    self.cls_predictor, self.cls_input_handles, self.cls_output_handle = self.create_predictor(
        args.cls_model_path)
    self.ext_label2id, self.ext_id2label = load_dict(args.ext_label_path)
    self.cls_label2id, self.cls_id2label = load_dict(args.cls_label_path)
    self.tokenizer = SkepTokenizer.from_pretrained(args.base_model_name)
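This constructor assumes a `create_predictor` helper on the same class. A minimal sketch of what it could look like with the standard Paddle Inference Python API, assuming the exported model is a `model_path + ".pdmodel"` / `".pdiparams"` pair and that `args` carries a `device` field; the real helper may configure the predictor differently:

import paddle.inference as paddle_infer

def create_predictor(self, model_path):
    # model_path is assumed to be the path prefix of an exported
    # static-graph model (model_path + ".pdmodel" / ".pdiparams").
    config = paddle_infer.Config(model_path + ".pdmodel",
                                 model_path + ".pdiparams")
    if self.args.device == "gpu":
        config.enable_use_gpu(100, 0)  # 100 MB initial pool on GPU 0
    else:
        config.disable_gpu()

    predictor = paddle_infer.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name)
        for name in predictor.get_input_names()
    ]
    # The SKEP models used here expose a single output tensor.
    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    return predictor, input_handles, output_handle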
Example #2
def predict_cls(args, ext_results):
    # load dict
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id), Stack(dtype="int64")
    ): fn(samples)

    # keep shuffle False so predictions stay aligned with the input order
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # load cls model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()

    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)

        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
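The classification stage consumes the output of the extraction stage (`ext_results`), so the two functions are chained. A minimal sketch of that wiring, assuming `predict_ext` from Example #5 below and an `args` namespace parsed earlier in the script:

if __name__ == "__main__":
    paddle.set_device(args.device)

    # Stage 1: extract aspect and opinion terms from the raw test file.
    ext_results = predict_ext(args)
    # Stage 2: classify the sentiment polarity of each extracted pair.
    cls_results = predict_cls(args, ext_results)

    for example, polarity in zip(ext_results, cls_results):
        example["polarity"] = polarity
        print(example)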
Example #3

if __name__ == "__main__":
    paddle.set_device(args.device)

    # These data samples are in Chinese.
    # If you use the English model, replace them with English samples.
    data = [
        '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般',
        '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
        '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
    ]
    label_map = {0: 'negative', 1: 'positive'}

    model = SkepForSequenceClassification.from_pretrained(
        args.model_name, num_classes=len(label_map))
    tokenizer = SkepTokenizer.from_pretrained(args.model_name)

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    results = predict(model,
                      data,
                      tokenizer,
                      label_map,
                      batch_size=args.batch_size)
    for idx, text in enumerate(data):
        print('Data: {} \t Label: {}'.format(text, results[idx]))
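The `predict` function called above is not shown in this example. A minimal sketch under the assumption that it tokenizes the raw texts itself and batches them with the same `Tuple`/`Pad` utilities used in the other examples; the signature matches the call site, and the `max_seq_len` default is an illustrative choice:

from paddlenlp.data import Pad, Tuple

@paddle.no_grad()
def predict(model, data, tokenizer, label_map, batch_size=1, max_seq_len=128):
    # Tokenize every text up front; fine for a handful of demo sentences.
    examples = [
        (enc["input_ids"], enc["token_type_ids"])
        for enc in (tokenizer(text, max_seq_len=max_seq_len) for text in data)
    ]

    # Pad input ids and token type ids to the longest sequence in the batch.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
    ): fn(samples)

    model.eval()
    results = []
    for start in range(0, len(examples), batch_size):
        input_ids, token_type_ids = batchify_fn(examples[start:start + batch_size])
        logits = model(paddle.to_tensor(input_ids),
                       token_type_ids=paddle.to_tensor(token_type_ids))
        preds = logits.argmax(axis=1).numpy().tolist()
        results.extend(label_map[p] for p in preds)
    return results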
Example #4
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_ds = load_dataset("cote", "dp", splits=['train'])
    # The COTE_DP dataset is labeled with the "BIO" tagging scheme.
    label_map = {label: idx for idx, label in enumerate(train_ds.label_list)}
    # `no_entity_label` marks tokens that are not part of any entity.
    no_entity_label_idx = label_map.get("O", 2)

    set_seed(args.seed)
    skep = SkepModel.from_pretrained('skep_ernie_1.0_large_ch')
    model = SkepCrfForTokenClassification(
        skep, num_classes=len(train_ds.label_list))
    tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')

    trans_func = partial(
        convert_example_to_feature,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_length,
        no_entity_label=no_entity_label_idx,
        is_test=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # token type ids
        Stack(dtype='int64'),  # sequence lens
        Pad(axis=0, pad_val=no_entity_label_idx)  # labels
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
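The snippet is cut off at the `create_dataloader` call. In the PaddleNLP demos this is usually a small helper along the following lines; treat it as a sketch rather than the exact implementation behind this example:

def create_dataloader(dataset, mode="train", batch_size=1,
                      batchify_fn=None, trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)

    shuffle = (mode == "train")
    if mode == "train":
        # Shards the data across ranks when training with paddle.distributed.
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)

    return paddle.io.DataLoader(dataset=dataset,
                                batch_sampler=batch_sampler,
                                collate_fn=batchify_fn,
                                return_list=True)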
Example #5
def predict_ext(args):
    # load dict
    model_name = "skep_ernie_1.0_large_ch"
    ext_label2id, ext_id2label = load_dict(args.ext_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    ori_test_ds = load_dataset(read_test_file,
                               data_path=args.test_path,
                               lazy=False)
    trans_func = partial(convert_example_to_feature_ext,
                         tokenizer=tokenizer,
                         label2id=ext_label2id,
                         max_seq_len=args.ext_max_seq_len,
                         is_test=True)
    test_ds = copy.copy(ori_test_ds).map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
    ): fn(samples)

    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # load ext model
    ext_state_dict = paddle.load(args.ext_model_path)
    ext_model = SkepForTokenClassification.from_pretrained(
        model_name, num_classes=len(ext_label2id))
    ext_model.load_dict(ext_state_dict)
    print("extraction model loaded.")

    ext_model.eval()
    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = ext_model(input_ids, token_type_ids=token_type_ids)

        predictions = logits.argmax(axis=2).numpy()
        for eid, (seq_len, prediction) in enumerate(zip(seq_lens,
                                                        predictions)):
            idx = bid * args.batch_size + eid
            tag_seq = [ext_id2label[label_id] for label_id in prediction[:seq_len][1:-1]]
            text = ori_test_ds[idx]["text"]
            aps = decoding(text[:args.ext_max_seq_len - 2], tag_seq)
            for aid, ap in enumerate(aps):
                aspect, opinions = ap[0], list(set(ap[1:]))
                aspect_text = concate_aspect_and_opinion(
                    text, aspect, opinions)
                results.append({
                    "id": str(idx) + "_" + str(aid),
                    "aspect": aspect,
                    "opinions": opinions,
                    "text": text,
                    "aspect_text": aspect_text
                })

    return results
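`load_dict` is used by several of these examples; it is typically just a reader for a label vocabulary file with one label per line. A sketch under that assumption:

def load_dict(dict_path):
    with open(dict_path, "r", encoding="utf-8") as f:
        labels = [line.strip() for line in f if line.strip()]
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for idx, label in enumerate(labels)}
    return label2id, id2label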
Example #6
def train():
    # set up the running environment
    model_name = "skep_ernie_1.0_large_ch"

    paddle.set_device(args.device)
    set_seed(args.seed)

    if not os.path.exists(args.checkpoints):
        os.mkdir(args.checkpoints)

    # load and process data
    label2id, id2label = load_dict(args.label_path)
    train_ds = load_dataset(read, data_path=args.train_path, lazy=False)
    dev_ds = load_dataset(read, data_path=args.dev_path, lazy=False)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    trans_func = partial(
        convert_example_to_feature,
        tokenizer=tokenizer,
        label2id=label2id,
        max_seq_len=args.max_seq_len)
    train_ds = train_ds.map(trans_func, lazy=False)
    dev_ds = dev_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
        Pad(axis=0, pad_val=-1, dtype="int64")
    ): fn(samples)

    train_batch_sampler = paddle.io.BatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)
    train_loader = paddle.io.DataLoader(
        train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(
        dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn)

    # configure model training
    model = SkepForTokenClassification.from_pretrained(
        model_name, num_classes=len(label2id))

    num_training_steps = len(train_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(
        learning_rate=args.learning_rate,
        total_steps=num_training_steps,
        warmup=args.warmup_proportion)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=grad_clip)

    metric = ChunkEvaluator(label2id.keys())

    # start to train model
    global_step, best_f1 = 1, 0.
    model.train()
    for epoch in range(1, args.num_epochs + 1):
        for batch_data in train_loader():
            input_ids, token_type_ids, _, labels = batch_data
            # logits shape: [batch_size, seq_len, num_tags]
            logits = model(input_ids, token_type_ids=token_type_ids)
            loss = F.cross_entropy(
                logits.reshape([-1, len(label2id)]),
                labels.reshape([-1]),
                ignore_index=-1)

            loss.backward()
            lr_scheduler.step()
            optimizer.step()
            optimizer.clear_grad()

            if global_step > 0 and global_step % args.log_steps == 0:
                print(
                    f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss:{loss.numpy().item():.6f}"
                )
            if (global_step > 0 and global_step % args.eval_steps == 0
                ) or global_step == num_training_steps:
                precision, recall, f1 = evaluate(model, dev_loader, metric)
                model.train()
                if f1 > best_f1:
                    print(
                        f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}"
                    )
                    best_f1 = f1
                    paddle.save(model.state_dict(),
                                f"{args.checkpoints}/best.pdparams")
                print(
                    f'evaluation result: precision: {precision:.5f}, recall: {recall:.5f}, F1: {f1:.5f}'
                )

            global_step += 1

    paddle.save(model.state_dict(), f"{args.checkpoints}/final.pdparams")