예제 #1
0
    def test_save_load_pretrained_additional_features(self):
        """Reloading a saved processor with extra kwargs must forward those kwargs
        to the tokenizer (slow and fast) and to the feature extractor."""
        processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)

        # Extra kwargs that from_pretrained should dispatch to each component.
        tok_kwargs = {"bos_token": "(BOS)", "eos_token": "(EOS)"}
        fe_kwargs = {"do_resize": False, "size": 30}

        # slow tokenizer
        expected_tokenizer = self.get_tokenizer(**tok_kwargs)
        expected_feature_extractor = self.get_feature_extractor(**fe_kwargs)

        processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname, use_fast=False, **tok_kwargs, **fe_kwargs)

        self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
        self.assertEqual(processor.tokenizer.get_vocab(), expected_tokenizer.get_vocab())

        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
        self.assertEqual(processor.feature_extractor.to_json_string(), expected_feature_extractor.to_json_string())

        # fast tokenizer
        expected_tokenizer = self.get_rust_tokenizer(**tok_kwargs)
        expected_feature_extractor = self.get_feature_extractor(**fe_kwargs)

        processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname, **tok_kwargs, **fe_kwargs)

        self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
        self.assertEqual(processor.tokenizer.get_vocab(), expected_tokenizer.get_vocab())

        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
        self.assertEqual(processor.feature_extractor.to_json_string(), expected_feature_extractor.to_json_string())
예제 #2
0
    def test_save_load_pretrained_default(self):
        """Round-tripping save_pretrained / from_pretrained must reproduce both
        the tokenizer vocabulary and the feature-extractor configuration."""
        feature_extractor = self.get_feature_extractor()
        for tokenizer in self.get_tokenizers():
            original = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            original.save_pretrained(self.tmpdirname)

            reloaded = LayoutLMv2Processor.from_pretrained(self.tmpdirname)

            self.assertIsInstance(reloaded.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
            self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())

            self.assertIsInstance(reloaded.feature_extractor, LayoutLMv2FeatureExtractor)
            self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
예제 #3
0
    def test_overflowing_tokens(self):
        # With return_overflowing_tokens=True, sequences that exceed max_length are
        # split into several chunks; each resulting chunk must still be paired with
        # its source image (1-to-1 between "image" and "input_ids").
        from datasets import load_dataset

        # set up
        datasets = load_dataset("nielsr/funsd")
        processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")

        def preprocess_data(examples):
            # Decode every page image and encode words/boxes/labels in one call.
            images = [Image.open(p).convert("RGB") for p in examples["image_path"]]
            return processor(
                images,
                examples["words"],
                boxes=examples["bboxes"],
                word_labels=examples["ner_tags"],
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=True,
                stride=50,
                return_offsets_mapping=True,
                return_tensors="pt",
            )

        train_data = preprocess_data(datasets["train"])

        self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
예제 #4
0
    # NOTE(review): the enclosing `def` is above this chunk; this tail classifies
    # `filename` by its extension (text after the last dot, case-sensitive).
    if filename.split(".")[-1] in ["png", "jpeg", "jpg"]:
        return "image"
    elif filename.split(".")[-1] in ["pdf"]:
        return "pdf"
    else:
        # Sentinel string instead of an exception — callers must check for it.
        return "UNSUPPORTED file type"


#-------------simple_invoice_extraction----------------------------------------
# All of the following is defined outside the API handlers so the models are not reloaded on every request
import numpy as np
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont

# Module-level setup for the simple-invoice-extraction endpoint: the processor,
# the fine-tuned token-classification model, and the label mapping are created
# once here so request handlers can reuse them.
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

model_simple_invoice = LayoutLMv2ForTokenClassification.from_pretrained(
    "Theivaprakasham/layoutlmv2-finetuned-sroie_mod"
)
dataset = load_dataset("darentang/generated", split="test")
labels = dataset.features["ner_tags"].feature.names
print('labels', labels)
# Map class index -> label name, e.g. {0: "O", 1: "b-abn", ...}.
id2label = dict(enumerate(labels))
label2color = {
    'b-abn': "blue",
    'b-biller': "blue",
    'b-biller_address': "black",
    'b-biller_post_code': "green",
    'b-due_date': "orange",
    'b-gst': 'red',
    'b-invoice_date': 'red',