예제 #1
0
    def __init__(self, args):
        self.args = args
        self.max_seq_length = args.max_seq_length

        # init ser token and model
        tokenizer_class, base_model_class, model_class = MODELS[
            args.ser_model_type]
        self.tokenizer = tokenizer_class.from_pretrained(
            args.model_name_or_path)
        self.model = model_class.from_pretrained(args.model_name_or_path)
        self.model.eval()

        # init ocr_engine
        from paddleocr import PaddleOCR

        self.ocr_engine = PaddleOCR(
            rec_model_dir=args.rec_model_dir,
            det_model_dir=args.det_model_dir,
            use_angle_cls=False,
            show_log=False)
        # init dict
        label2id_map, self.id2label_map = get_bio_label_maps(
            args.label_map_path)
        self.label2id_map_for_draw = dict()
        for key in label2id_map:
            if key.startswith("I-"):
                self.label2id_map_for_draw[key] = label2id_map["B" + key[1:]]
            else:
                self.label2id_map_for_draw[key] = label2id_map[key]
예제 #2
0
def eval(args):
    logger = get_logger()
    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)

    model = LayoutXLMForRelationExtraction.from_pretrained(
        args.model_name_or_path)

    eval_dataset = XFUNDataset(tokenizer,
                               data_dir=args.eval_data_dir,
                               label_path=args.eval_label_path,
                               label2id_map=label2id_map,
                               img_size=(224, 224),
                               max_seq_len=args.max_seq_length,
                               pad_token_label_id=pad_token_label_id,
                               contains_re=True,
                               add_special_ids=False,
                               return_attention_mask=True,
                               load_mode='all')

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        shuffle=False,
        collate_fn=DataCollator())

    results = evaluate(model, eval_dataloader, logger)
    logger.info("eval results: {}".format(results))
예제 #3
0
def postprocess(attention_mask, preds, label_map_path):
    if isinstance(preds, paddle.Tensor):
        preds = preds.numpy()
    preds = np.argmax(preds, axis=2)

    _, label_map = get_bio_label_maps(label_map_path)

    preds_list = [[] for _ in range(preds.shape[0])]

    # keep batch info
    for i in range(preds.shape[0]):
        for j in range(preds.shape[1]):
            if attention_mask[i][j] == 1:
                preds_list[i].append(label_map[preds[i][j]])

    return preds_list
예제 #4
0
def merge_preds_list_with_ocr_info(label_map_path, ocr_info, segment_offset_id,
                                   preds_list):
    # must ensure the preds_list is generated from the same image
    preds = [p for pred in preds_list for p in pred]
    label2id_map, _ = get_bio_label_maps(label_map_path)
    for key in label2id_map:
        if key.startswith("I-"):
            label2id_map[key] = label2id_map["B" + key[1:]]

    id2label_map = dict()
    for key in label2id_map:
        val = label2id_map[key]
        if key == "O":
            id2label_map[val] = key
        if key.startswith("B-") or key.startswith("I-"):
            id2label_map[val] = key[2:]
        else:
            id2label_map[val] = key

    for idx in range(len(segment_offset_id)):
        if idx == 0:
            start_id = 0
        else:
            start_id = segment_offset_id[idx - 1]

        end_id = segment_offset_id[idx]

        curr_pred = preds[start_id:end_id]
        curr_pred = [label2id_map[p] for p in curr_pred]

        if len(curr_pred) <= 0:
            pred_id = 0
        else:
            counts = np.bincount(curr_pred)
            pred_id = np.argmax(counts)
        ocr_info[idx]["pred_id"] = int(pred_id)
        ocr_info[idx]["pred"] = id2label_map[pred_id]
    return ocr_info
예제 #5
0
def eval(args):
    logger = get_logger()
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer_class, base_model_class, model_class = MODELS[
        args.ser_model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = model_class.from_pretrained(args.model_name_or_path)

    eval_dataset = XFUNDataset(tokenizer,
                               data_dir=args.eval_data_dir,
                               label_path=args.eval_label_path,
                               label2id_map=label2id_map,
                               img_size=(224, 224),
                               pad_token_label_id=pad_token_label_id,
                               contains_re=False,
                               add_special_ids=False,
                               return_attention_mask=True,
                               load_mode='all')

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None,
    )

    loss_class = SERLoss(len(label2id_map))

    results, _ = evaluate(args, model, tokenizer, loss_class, eval_dataloader,
                          label2id_map, id2label_map, pad_token_label_id,
                          logger)

    logger.info(results)
예제 #6
0
def train(args):
    os.makedirs(args.output_dir, exist_ok=True)
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1

    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    loss_class = SERLoss(len(label2id_map))

    pad_token_label_id = loss_class.ignore_index

    # dist mode
    if distributed:
        paddle.distributed.init_parallel_env()

    tokenizer_class, base_model_class, model_class = MODELS[args.ser_model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    if not args.resume:
        base_model = base_model_class.from_pretrained(args.model_name_or_path)
        model = model_class(
            base_model, num_classes=len(label2id_map), dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = model_class.from_pretrained(args.model_name_or_path)

    # dist mode
    if distributed:
        model = paddle.DataParallel(model)

    train_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.train_data_dir,
        label_path=args.train_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None, )

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None, )

    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        epsilon=args.adam_epsilon,
        weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed) = %d",
        args.per_gpu_train_batch_size * paddle.distributed.get_world_size(), )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    set_seed(args.seed)
    best_metrics = None

    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()

    print_step = 1
    model.train()
    for epoch_id in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - reader_start

            if args.ser_model_type == 'LayoutLM':
                if 'image' in batch:
                    batch.pop('image')
            labels = batch.pop('labels')

            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
            if args.ser_model_type == 'LayoutXLM':
                outputs = outputs[0]
            loss = loss_class(labels, outputs, batch['attention_mask'])

            # model outputs are always tuple in ppnlp (see doc)
            loss = loss.mean()
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()  # Update learning rate schedule
            optimizer.clear_grad()
            global_step += 1
            total_samples += batch['input_ids'].shape[0]

            if rank == 0 and step % print_step == 0:
                logger.info(
                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch_id, args.num_train_epochs, step,
                           len(train_dataloader), global_step,
                           loss.numpy()[0],
                           lr_scheduler.get_lr(), train_reader_cost /
                           print_step, (train_reader_cost + train_run_cost) /
                           print_step, total_samples / print_step, total_samples
                           / (train_reader_cost + train_run_cost)))

                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                # Log metrics
                # Only evaluate when single GPU otherwise metrics may not average well
                results, _ = evaluate(args, model, tokenizer, loss_class,
                                      eval_dataloader, label2id_map,
                                      id2label_map, pad_token_label_id, logger)

                if best_metrics is None or results["f1"] >= best_metrics["f1"]:
                    best_metrics = copy.deepcopy(results)
                    output_dir = os.path.join(args.output_dir, "best_model")
                    os.makedirs(output_dir, exist_ok=True)
                    if distributed:
                        model._layers.save_pretrained(output_dir)
                    else:
                        model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to {}".format(
                        output_dir))

                logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
                    epoch_id, args.num_train_epochs, step,
                    len(train_dataloader), results))
                if best_metrics is not None:
                    logger.info("best metrics: {}".format(best_metrics))
            reader_start = time.time()
        if rank == 0:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "latest_model")
            os.makedirs(output_dir, exist_ok=True)
            if distributed:
                model._layers.save_pretrained(output_dir)
            else:
                model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to {}".format(output_dir))
    return global_step, tr_loss / global_step
예제 #7
0
def train(args):
    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1

    print_arguments(args, logger)

    # Added here for reproducibility (even between python 2 and 3)
    set_seed(args.seed)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    # dist mode
    if distributed:
        paddle.distributed.init_parallel_env()

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    if not args.resume:
        model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
        model = LayoutXLMForRelationExtraction(model, dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = LayoutXLMForRelationExtraction.from_pretrained(
            args.model_name_or_path)

    # dist mode
    if distributed:
        model = paddle.DataParallel(model)

    train_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.train_data_dir,
        label_path=args.train_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=DataCollator())

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        shuffle=False,
        collate_fn=DataCollator())

    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )
    grad_clip = paddle.nn.ClipGradByNorm(clip_norm=10)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        epsilon=args.adam_epsilon,
        grad_clip=grad_clip,
        weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = {}".format(len(train_dataset)))
    logger.info("  Num Epochs = {}".format(args.num_train_epochs))
    logger.info("  Instantaneous batch size per GPU = {}".format(
        args.per_gpu_train_batch_size))
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = {}".
        format(args.per_gpu_train_batch_size *
               paddle.distributed.get_world_size()))
    logger.info("  Total optimization steps = {}".format(t_total))

    global_step = 0
    model.clear_gradients()
    train_dataloader_len = len(train_dataloader)
    best_metirc = {'f1': 0}
    model.train()

    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()

    print_step = 1

    for epoch in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
            # model outputs are always tuple in ppnlp (see doc)
            loss = outputs['loss']
            loss = loss.mean()

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # lr_scheduler.step()  # Update learning rate schedule

            global_step += 1
            total_samples += batch['image'].shape[0]

            if rank == 0 and step % print_step == 0:
                logger.info(
                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, args.num_train_epochs, step,
                           train_dataloader_len, global_step,
                           np.mean(loss.numpy()),
                           optimizer.get_lr(), train_reader_cost / print_step, (
                               train_reader_cost + train_run_cost) / print_step,
                           total_samples / print_step, total_samples / (
                               train_reader_cost + train_run_cost)))

                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                # Log metrics
                # Only evaluate when single GPU otherwise metrics may not average well
                results = evaluate(model, eval_dataloader, logger)
                if results['f1'] >= best_metirc['f1']:
                    best_metirc = results
                    output_dir = os.path.join(args.output_dir, "best_model")
                    os.makedirs(output_dir, exist_ok=True)
                    if distributed:
                        model._layers.save_pretrained(output_dir)
                    else:
                        model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to {}".format(
                        output_dir))
                logger.info("eval results: {}".format(results))
                logger.info("best_metirc: {}".format(best_metirc))
            reader_start = time.time()

        if rank == 0:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "latest_model")
            os.makedirs(output_dir, exist_ok=True)
            if distributed:
                model._layers.save_pretrained(output_dir)
            else:
                model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to {}".format(output_dir))
    logger.info("best_metirc: {}".format(best_metirc))
예제 #8
0
def infer(args):
    os.makedirs(args.output_dir, exist_ok=True)
    logger = get_logger()
    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)

    model = LayoutXLMForRelationExtraction.from_pretrained(
        args.model_name_or_path)

    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=8,
        shuffle=False,
        collate_fn=DataCollator())

    # 读取gt的oct数据
    ocr_info_list = load_ocr(args.eval_data_dir, args.eval_label_path)

    for idx, batch in enumerate(eval_dataloader):
        ocr_info = ocr_info_list[idx]
        image_path = ocr_info['image_path']
        ocr_info = ocr_info['ocr_info']

        save_img_path = os.path.join(
            args.output_dir,
            os.path.splitext(os.path.basename(image_path))[0] + "_re.jpg")
        logger.info("[Infer] process: {}/{}, save result to {}".format(
            idx, len(eval_dataloader), save_img_path))
        with paddle.no_grad():
            outputs = model(**batch)
        pred_relations = outputs['pred_relations']

        # 根据entity里的信息,做token解码后去过滤不要的ocr_info
        ocr_info = filter_bg_by_txt(ocr_info, batch, tokenizer)

        # 进行 relations 到 ocr信息的转换
        result = []
        used_tail_id = []
        for relations in pred_relations:
            for relation in relations:
                if relation['tail_id'] in used_tail_id:
                    continue
                if relation['head_id'] not in ocr_info or relation[
                        'tail_id'] not in ocr_info:
                    continue
                used_tail_id.append(relation['tail_id'])
                ocr_info_head = ocr_info[relation['head_id']]
                ocr_info_tail = ocr_info[relation['tail_id']]
                result.append((ocr_info_head, ocr_info_tail))

        img = cv2.imread(image_path)
        img_show = draw_re_results(img, result)
        cv2.imwrite(save_img_path, img_show)