Example #1
    def __init__(self, model_name="wordtag", term_linking=True, tag_path=None):
        """Initialize method of the predictor.

        Args:
            model_name (`str`):
                The pre-trained model name.
            term_linking (`bool`):
                Whether to perform term linking after tagging. Defaults to ``True``.
            tag_path (`str`):
                The tag vocab path. If ``None``, the default tag vocab is downloaded.
        """
        term_schema_path = self._download_termtree("termtree_type.csv")
        term_data_path = self._download_termtree("termtree.rawbase")
        if tag_path is None:
            tag_path = self._download_termtree("termtree_tags.txt")
        self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)

        self._model = ErnieCtmWordtagModel.from_pretrained(
            model_name,
            num_cls_label=4,
            num_tag=len(self._tags_to_index),
            ignore_index=self._tags_to_index["O"])
        self._model.eval()

        self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_name)
        self._summary_num = self._model.ernie_ctm.content_summary_index + 1
        if term_schema_path is not None:
            self._term_schema = self._load_schema(term_schema_path)
        if term_data_path is not None:
            self._term_dict = self._load_term_tree_data(term_data_path)
        if term_data_path is not None and term_schema_path is not None and term_linking:
            self._linking = True
        else:
            self._linking = False
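For reference, a minimal sketch of what the `_load_labels` helper used above could do, written as a standalone function. The real helper is not shown in this example, so the file format (one tag per line) is an assumption.

def load_labels(tag_path):
    # Hypothetical helper: read one tag per line and build the
    # tag->index / index->tag mappings consumed in __init__ above.
    tags_to_index = {}
    with open(tag_path, encoding="utf-8") as fp:
        for line in fp:
            tag = line.strip()
            if tag and tag not in tags_to_index:
                tags_to_index[tag] = len(tags_to_index)
    index_to_tags = {index: tag for tag, index in tags_to_index.items()}
    return tags_to_index, index_to_tags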
Example #2
    def __init__(self, model_dir, tag_path, linking_path=None):
        """Initialize method of the predictor.

        Args:
            model_dir: The pre-trained model checkpoint dir.
            tag_path: The tag vocab path.
            linking_path: The linking feature file to load; required to enable linking mode. Defaults to None.
        """
        self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)

        self._model = ErnieCtmWordtagModel.from_pretrained(
            model_dir,
            num_cls_label=4,
            num_tag=len(self._tags_to_index),
            ignore_index=self._tags_to_index["O"])
        self._model.eval()

        self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_dir)
        self._summary_num = self._model.ernie_ctm.content_summary_index + 1
        self.linking = False
        if linking_path is not None:
            self.linking_dict = {}
            with open(linking_path, encoding="utf-8") as fp:
                for line in fp:
                    data = json.loads(line)
                    if data["label"] not in self.linking_dict:
                        self.linking_dict[data["label"]] = []
                    self.linking_dict[data["label"]].append({
                        "sid":
                        data["sid"],
                        "cls":
                        paddle.to_tensor(data["cls1"]).unsqueeze(0),
                        "term":
                        paddle.to_tensor(data["term"]).unsqueeze(0)
                    })
            self.linking = True
            self.sim_fct = nn.CosineSimilarity(dim=1)
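To make the linking data concrete, here is a small self-contained sketch of how candidates stored in `linking_dict` could be ranked against a mention embedding by cosine similarity. The [1, 768] shapes and the ranking loop are assumptions for illustration; the snippet above only shows how the dictionary is built.

import paddle

def cosine_sim(a, b):
    # Cosine similarity between two [1, hidden] tensors using basic ops,
    # mirroring what nn.CosineSimilarity computes along dimension 1.
    num = paddle.sum(a * b)
    den = paddle.sqrt(paddle.sum(a * a)) * paddle.sqrt(paddle.sum(b * b))
    return (num / den).item()

# Toy stand-ins for one label's candidate list in linking_dict.
mention_vec = paddle.rand([1, 768])
candidates = [{"sid": i, "term": paddle.rand([1, 768])} for i in range(3)]

scores = [cosine_sim(mention_vec, cand["term"]) for cand in candidates]
best = max(range(len(scores)), key=scores.__getitem__)
print(candidates[best]["sid"], scores[best])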
Example #3
    def __init__(self,
                 model_dir,
                 tag_path,
                 term_schema_path=None,
                 term_data_path=None):
        """Initialize method of the predictor.

        Args:
            model_dir (`str`): 
                The pre-trained model checkpoint dir.
            tag_path (`str`): 
                The tag vocab path.
            term_schema_path (`str`, optional):
                The term schema path; required to enable linking mode. Defaults to ``None``.
            term_data_path (`str`, optional):
                The term data path; required to enable linking mode. Defaults to ``None``.
        """
        self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)

        self._model = ErnieCtmWordtagModel.from_pretrained(
            model_dir,
            num_cls_label=4,
            num_tag=len(self._tags_to_index),
            ignore_index=self._tags_to_index["O"])
        self._model.eval()

        self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_dir)
        self._summary_num = self._model.ernie_ctm.content_summary_index + 1
        if term_schema_path is not None:
            self._term_schema = self._load_schema(term_schema_path)
        if term_data_path is not None:
            self._term_dict = self._load_term_tree_data(term_data_path)
        if term_data_path is not None and term_schema_path is not None:
            self._linking = True
        else:
            self._linking = False
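A hypothetical usage sketch: the class name `WordtagPredictor` is an assumption (only `__init__` is shown), and the term-tree file names reuse the ones downloaded in Example #1.

# Tagging only:
predictor = WordtagPredictor(
    model_dir="./output/model_100",
    tag_path="./termtree_tags.txt")

# Tagging plus term linking (both schema and term data must be provided):
linking_predictor = WordtagPredictor(
    model_dir="./output/model_100",
    tag_path="./termtree_tags.txt",
    term_schema_path="./termtree_type.csv",
    term_data_path="./termtree.rawbase")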
Example #4
                if cls_label_can in name_dict:
                    result['label'] = cls_label_can
                    break
                else:
                    labels_can = bk_tree.search_similar_word(label)
                    result['label'] = labels_can[0][0]

        result['category'] = name_dict[result['label']]
        results.append(result)
    return results


if __name__ == "__main__":
    paddle.set_device(args.device)

    data = [
        '刘德华',
        '快乐薯片',
        '自适应共振理论映射',
    ]

    model = ErnieCtmNptagModel.from_pretrained("nptag")
    tokenizer = ErnieCtmTokenizer.from_pretrained("nptag")

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    results = do_predict(data, model, tokenizer, batch_size=args.batch_size)
    print(results)
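The `__main__` block above relies on an `args` namespace exposing `device`, `params_path`, and `batch_size`. A minimal sketch of the corresponding argument parser; the defaults and help strings are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--device", default="gpu",
                    help="Device to run on, e.g. 'gpu' or 'cpu'.")
parser.add_argument("--params_path", default=None,
                    help="Optional fine-tuned checkpoint (.pdparams) to load.")
parser.add_argument("--batch_size", type=int, default=32,
                    help="Batch size used by do_predict.")
args = parser.parse_args()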
Example #5
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)

    tokenizer = ErnieCtmTokenizer.from_pretrained("nptag")
    model = ErnieCtmNptagModel.from_pretrained("nptag")
    vocab_size = model.ernie_ctm.config["vocab_size"]

    trans_func = partial(convert_example,
                         tokenzier=tokenizer,
                         max_seq_len=args.max_seq_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Pad(axis=0, pad_val=-100, dtype='int64'),  # labels
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)
    num_training_steps = len(train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)

    metric = NPTagAccuracy()
    criterion = paddle.nn.CrossEntropyLoss()

    global_step = 0
    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits.reshape([-1, vocab_size]),
                             labels.reshape([-1]))

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, loss.numpy().item(), speed))
                start_time = time.time()

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model._layers.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, criterion, dev_data_loader, vocab_size)
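To see what the `batchify_fn` above produces, here is a self-contained toy run of the same `Tuple`/`Pad` combination from `paddlenlp.data`. The sample values are made up; the label pad value of -100 mirrors the training script and is ignored by `paddle.nn.CrossEntropyLoss` by default.

from paddlenlp.data import Pad, Tuple

samples = [
    ([1, 5, 7, 2], [0, 0, 0, 0], [-100, 9, -100, -100]),
    ([1, 5, 2], [0, 0, 0], [-100, 4, -100]),
]
batchify_fn = Tuple(
    Pad(axis=0, pad_val=0, dtype='int64'),     # input_ids
    Pad(axis=0, pad_val=0, dtype='int64'),     # token_type_ids
    Pad(axis=0, pad_val=-100, dtype='int64'),  # labels
)
input_ids, token_type_ids, labels = batchify_fn(samples)
print(input_ids.shape)  # (2, 4): the shorter sample is padded to the batch max
print(labels)           # [[-100 9 -100 -100], [-100 4 -100 -100]]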
Example #6
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0
    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
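A note on the warmup handling above: `LinearDecayWithWarmup` accepts either an integer number of warmup steps or a float warmup proportion, which is why the script can pass whichever of `args.warmup_steps` / `args.warmup_proportion` was configured. A small sketch with made-up learning rate and step counts:

from paddlenlp.transformers import LinearDecayWithWarmup

num_training_steps = 1000
by_steps = LinearDecayWithWarmup(5e-5, num_training_steps, 100)  # 100 warmup steps
by_ratio = LinearDecayWithWarmup(5e-5, num_training_steps, 0.1)  # 10% of steps as warmup

for _ in range(50):
    by_steps.step()
print(by_steps.get_lr())  # still ramping up during the first 100 steps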
Example #7
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_ds = load_dataset(datafiles=('./data/train.json'))
    tags_to_idx = load_dict("./data/tags.txt")
    labels_to_idx = load_dict("./data/classifier_labels.txt")
    tokenizer = ErnieCtmTokenizer.from_pretrained(args.model_dir)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx,
                         labels_to_idx=labels_to_idx)
    train_ds.map(trans_func)

    ignore_label = tags_to_idx["O"]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=ignore_label, dtype='int64'),  # tags
        Stack(dtype='int64'),  # cls_label
    ): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=False, drop_last=True)
    train_data_loader = DataLoader(train_ds,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   collate_fn=batchify_fn,
                                   return_list=True)

    model = ErnieCtmWordtagModel.from_pretrained(
        args.model_dir,
        num_cls_label=len(labels_to_idx),
        num_tag=len(tags_to_idx),
        ignore_index=tags_to_idx["O"])

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    cls_acc = paddle.metric.Accuracy()
    seq_acc = SequenceAccuracy()
    total_loss = 0

    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags, cls_label = batch

            outputs = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags,
                            cls_label=cls_label)
            loss, seq_logits, cls_logits = outputs[0], outputs[1], outputs[2]
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            cls_correct = cls_acc.compute(
                pred=cls_logits.reshape([-1, len(labels_to_idx)]),
                label=cls_label.reshape([-1]))
            cls_acc.update(cls_correct)
            seq_correct = seq_acc.compute(
                pred=seq_logits.reshape([-1, len(tags_to_idx)]),
                label=tags.reshape([-1]),
                ignore_index=tags_to_idx["O"])
            seq_acc.update(seq_correct)

            if global_step % args.logging_steps == 0 and global_step != 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "[Training]["
                    "epoch: %s/%s][step: %s/%s] loss: %6f, Classification Accuracy: %6f, Sequence Labeling Accuracy: %6f, speed: %6f"
                    % (epoch, args.num_train_epochs, global_step,
                       num_training_steps, total_loss / args.logging_steps,
                       cls_acc.accumulate(), seq_acc.accumulate(), speed))
                start_time = time.time()
                cls_acc.reset()
                seq_acc.reset()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps
                ) and paddle.distributed.get_rank() == 0:
                output_dir = os.path.join(
                    args.output_dir,
                    "ernie_ctm_ft_model_%d.pdparams" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
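The `decay_params` / `apply_decay_param_fun` pattern shared by the three training scripts excludes bias and LayerNorm parameters from weight decay. A self-contained toy illustration; the `Toy` layer is invented solely for this example.

import paddle
import paddle.nn as nn

class Toy(nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        self.norm = nn.LayerNorm(4)

    def forward(self, x):
        return self.norm(self.linear(x))

model = Toy()
# Keep only parameters whose structured names contain neither "bias" nor "norm".
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=lambda x: x in decay_params)
print(decay_params)  # only the Linear weight's internal name remains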