예제 #1
0
    def test_processor_case_1(self):
        # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True

        feature_extractor = LayoutLMv2FeatureExtractor()
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)

            # not batched
            input_feat_extract = feature_extractor(images[0],
                                                   return_tensors="pt")
            input_processor = processor(images[0], return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify image
            self.assertAlmostEqual(input_feat_extract["pixel_values"].sum(),
                                   input_processor["image"].sum(),
                                   delta=1e-2)

            # verify input_ids
            # this was obtained with Tesseract 4.1.1
            # fmt: off
            expected_decoding = "<s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # noqa: E231
            # fmt: on
            decoding = processor.decode(
                input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            input_feat_extract = feature_extractor(images, return_tensors="pt")
            input_processor = processor(images,
                                        padding=True,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify images
            self.assertAlmostEqual(input_feat_extract["pixel_values"].sum(),
                                   input_processor["image"].sum(),
                                   delta=1e-2)

            # verify input_ids
            # this was obtained with Tesseract 4.1.1
            # fmt: off
            expected_decoding = "<s> 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"  # noqa: E231
            # fmt: on
            decoding = processor.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)
예제 #2
0
    def test_save_load_pretrained_default(self):
        feature_extractor = self.get_feature_extractor()
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)

            processor.save_pretrained(self.tmpdirname)
            processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname)

            self.assertEqual(processor.tokenizer.get_vocab(),
                             tokenizer.get_vocab())
            self.assertIsInstance(processor.tokenizer,
                                  (LayoutXLMTokenizer, LayoutXLMTokenizerFast))

            self.assertEqual(processor.feature_extractor.to_json_string(),
                             feature_extractor.to_json_string())
            self.assertIsInstance(processor.feature_extractor,
                                  LayoutLMv2FeatureExtractor)
예제 #3
0
    def test_save_load_pretrained_additional_features(self):
        processor = LayoutXLMProcessor(
            feature_extractor=self.get_feature_extractor(),
            tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)

        # slow tokenizer
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)",
                                                  eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(
            do_resize=False, size=30)

        processor = LayoutXLMProcessor.from_pretrained(
            self.tmpdirname,
            use_fast=False,
            bos_token="(BOS)",
            eos_token="(EOS)",
            do_resize=False,
            size=30,
        )

        self.assertEqual(processor.tokenizer.get_vocab(),
                         tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(),
                         feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor,
                              LayoutLMv2FeatureExtractor)

        # fast tokenizer
        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)",
                                                       eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(
            do_resize=False, size=30)

        processor = LayoutXLMProcessor.from_pretrained(self.tmpdirname,
                                                       use_xlm=True,
                                                       bos_token="(BOS)",
                                                       eos_token="(EOS)",
                                                       do_resize=False,
                                                       size=30)

        self.assertEqual(processor.tokenizer.get_vocab(),
                         tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast)

        self.assertEqual(processor.feature_extractor.to_json_string(),
                         feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor,
                              LayoutLMv2FeatureExtractor)
예제 #4
0
    def test_processor_case_5(self):
        # case 5: visual question answering (inference), apply_ocr=False

        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)

            # not batched
            question = "What's his name?"
            words = ["hello", "world"]
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
            input_processor = processor(images[0],
                                        question,
                                        words,
                                        boxes,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "<s> What's his name?</s></s> hello world</s>"
            decoding = tokenizer.decode(
                input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            questions = ["How old is he?", "what's the time"]
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]],
                     [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
            input_processor = processor(images,
                                        questions,
                                        words,
                                        boxes,
                                        padding=True,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "<s> How old is he?</s></s> hello world</s><pad><pad>"
            decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            expected_decoding = "<s> what's the time</s></s> my name is niels</s>"
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3],
                             [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
            self.assertListEqual(input_processor.bbox[1].tolist()[-5:],
                                 expected_bbox)
예제 #5
0
    def test_processor_case_4(self):
        # case 4: visual question answering (inference), apply_ocr=True

        feature_extractor = LayoutLMv2FeatureExtractor()
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)

            # not batched
            question = "What's his name?"
            input_processor = processor(images[0],
                                        question,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            # fmt: off
            expected_decoding = "<s> What's his name?</s></s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>"  # noqa: E231
            # fmt: on
            decoding = tokenizer.decode(
                input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            questions = ["How old is he?", "what's the time"]
            input_processor = processor(images,
                                        questions,
                                        padding="max_length",
                                        max_length=20,
                                        truncation=True,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "<s> what's the time</s></s> 7 ITC Limited REPORT AND ACCOUNTS 2013</s>"
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            # fmt: off
            expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
                             [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
                             [1000, 1000, 1000, 1000],
                             [1000, 1000, 1000, 1000], [0, 45, 67, 80],
                             [72, 56, 109, 67], [72, 56, 109, 67],
                             [116, 56, 189, 67], [198, 59, 253, 66],
                             [257, 59, 285, 66], [289, 59, 365, 66],
                             [289, 59, 365, 66], [289, 59, 365, 66],
                             [289, 59, 365, 66], [372, 59, 407, 66],
                             [1000, 1000, 1000, 1000]]  # noqa: E231
            # fmt: on
            self.assertListEqual(input_processor.bbox[1].tolist(),
                                 expected_bbox)
예제 #6
0
    def test_processor_case_3(self):
        # case 3: token classification (training), apply_ocr=False

        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)

            # not batched
            words = ["weirdly", "world"]
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
            word_labels = [1, 2]
            input_processor = processor(images[0],
                                        words,
                                        boxes=boxes,
                                        word_labels=word_labels,
                                        return_tensors="pt")

            # verify keys
            expected_keys = [
                "attention_mask", "bbox", "image", "input_ids", "labels"
            ]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "<s> weirdly world</s>"
            decoding = tokenizer.decode(
                input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify labels
            expected_labels = [-100, 1, -100, 2, -100]
            self.assertListEqual(input_processor.labels.squeeze().tolist(),
                                 expected_labels)

            # batched
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]],
                     [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
            word_labels = [[1, 2], [6, 3, 10, 2]]
            input_processor = processor(images,
                                        words,
                                        boxes=boxes,
                                        word_labels=word_labels,
                                        padding=True,
                                        return_tensors="pt")

            # verify keys
            expected_keys = [
                "attention_mask", "bbox", "image", "input_ids", "labels"
            ]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "<s> my name is niels</s>"
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            expected_bbox = [
                [0, 0, 0, 0],
                [3, 2, 5, 1],
                [6, 7, 4, 2],
                [3, 9, 2, 4],
                [1, 1, 2, 3],
                [1, 1, 2, 3],
                [1000, 1000, 1000, 1000],
            ]
            self.assertListEqual(input_processor.bbox[1].tolist(),
                                 expected_bbox)

            # verify labels
            expected_labels = [-100, 6, 3, 10, 2, -100, -100]
            self.assertListEqual(input_processor.labels[1].tolist(),
                                 expected_labels)
예제 #7
0
    def test_processor_case_2(self):
        # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False

        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutXLMProcessor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)

            # not batched
            words = ["hello", "world"]
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
            input_processor = processor(images[0],
                                        words,
                                        boxes=boxes,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["input_ids", "bbox", "attention_mask", "image"]
            actual_keys = list(input_processor.keys())
            for key in expected_keys:
                self.assertIn(key, actual_keys)

            # verify input_ids
            expected_decoding = "<s> hello world</s>"
            decoding = tokenizer.decode(
                input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]],
                     [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
            input_processor = processor(images,
                                        words,
                                        boxes=boxes,
                                        padding=True,
                                        return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "<s> hello world</s><pad><pad>"
            decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            expected_bbox = [
                [0, 0, 0, 0],
                [3, 2, 5, 1],
                [6, 7, 4, 2],
                [3, 9, 2, 4],
                [1, 1, 2, 3],
                [1, 1, 2, 3],
                [1000, 1000, 1000, 1000],
            ]
            self.assertListEqual(input_processor.bbox[1].tolist(),
                                 expected_bbox)