示例#1
0
    def test_vdom_element_from_dict(self):
        vdom_element_json = """
    {
      "Id": 0,
      "text": "Director",
      "feature": [
        208.0,
        49.0,
        138.0,
        15.0,
        0.0,
        0.0,
        0.0,
        0.0,
        12.0,
        1.0,
        198.0,
        564.0,
        138.0,
        15.0,
        1.0,
        0.0,
        0.0,
        0.0,
        12.0,
        1.0
      ],
      "start_idx": 5,
      "end_idx": 6
    }
    """
        vdom_element = lib.VdomElement.from_dict(json.loads(vdom_element_json))

        expected = lib.VdomElement(id=0,
                                   text='Director',
                                   features=lib.VdomFeatures(x_coord=208.0,
                                                             width=49.0,
                                                             y_coord=138.0,
                                                             height=15.0,
                                                             is_block=False,
                                                             is_inline=False,
                                                             is_heading=False,
                                                             is_leaf=False,
                                                             font_size=12,
                                                             is_bold=True),
                                   parent_features=lib.VdomFeatures(
                                       x_coord=198.0,
                                       width=564.0,
                                       y_coord=138.0,
                                       height=15.0,
                                       is_block=True,
                                       is_inline=False,
                                       is_heading=False,
                                       is_leaf=False,
                                       font_size=12,
                                       is_bold=True),
                                   start_idx=5,
                                   end_idx=6)

        self.assertEqual(expected, vdom_element)
示例#2
0
    def test_etc_features_fixed_global_blocks(self):
        example = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text=
            'Star Trek Discovery Season 1 Jason Isaacs Jason Isaacs and Doug',
            vdom=[
                lib.VdomElement(id=0,
                                text='Star Trek Discovery Season 1 Jason',
                                features=lib.VdomFeatures(x_coord=44.0,
                                                          width=728.0,
                                                          y_coord=78.0,
                                                          height=45.0,
                                                          is_block=True,
                                                          is_inline=False,
                                                          is_heading=True,
                                                          is_leaf=False,
                                                          font_size=20,
                                                          is_bold=False),
                                parent_features=lib.VdomFeatures(
                                    x_coord=44.0,
                                    width=728.0,
                                    y_coord=78.0,
                                    height=45.0,
                                    is_block=True,
                                    is_inline=False,
                                    is_heading=True,
                                    is_leaf=False,
                                    font_size=20,
                                    is_bold=False),
                                start_idx=0,
                                end_idx=6),
                lib.VdomElement(id=0,
                                text='Isaacs Jason Isaacs and Doug',
                                features=lib.VdomFeatures(x_coord=208.0,
                                                          width=49.0,
                                                          y_coord=138.0,
                                                          height=15.0,
                                                          is_block=False,
                                                          is_inline=False,
                                                          is_heading=False,
                                                          is_leaf=False,
                                                          font_size=12,
                                                          is_bold=True),
                                parent_features=lib.VdomFeatures(
                                    x_coord=198.0,
                                    width=564.0,
                                    y_coord=138.0,
                                    height=15.0,
                                    is_block=True,
                                    is_inline=False,
                                    is_heading=False,
                                    is_leaf=False,
                                    font_size=12,
                                    is_bold=True),
                                start_idx=6,
                                end_idx=11)
            ],
            key_phrases=[
                lib.KeyPhrase(['Star', 'Trek']),
                lib.KeyPhrase(['Jason', 'Isaacs'])
            ])

        bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                       VOCAB_PATH)
        config = lib.EtcFeaturizationConfig(long_max_length=16,
                                            global_max_length=4,
                                            url_max_code_points=80,
                                            bert_vocab_path=bert_vocab_path,
                                            do_lower_case=True,
                                            fixed_block_len=4)
        tokenizer = tokenization.FullTokenizer(
            config.bert_vocab_path, do_lower_case=config.do_lower_case)
        etc_features = example.to_etc_features(tokenizer, config)
        expected = lib.OpenKpEtcFeatures(
            url_code_points=[
                104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
                108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97,
                116, 99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115,
                116, 97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115,
                99, 111, 118, 101, 114, 121, 45, 115, 101, 97, 115, 111, 110,
                45, 49, 46, 104, 116, 109, 108, -1, -1, -1, -1, -1, -1, -1
            ],
            label_start_idx=[5, 0, -1],
            label_phrase_len=[2, 2, -1],
            long_token_ids=[
                3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 0, 0, 0
            ],
            long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 0, 0, 0],
            long_vdom_idx=[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0],
            long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
            long_word_input_mask=[
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
            ],
            global_token_ids=[1, 1, 1, 1],
            global_input_mask=[1, 1, 1, 1],
            global_x_coords=[],
            global_y_coords=[],
            global_widths=[],
            global_heights=[],
            global_font_ids=[],
            global_block_indicator=[],
            global_inline_indicator=[],
            global_heading_indicator=[],
            global_leaf_indicator=[],
            global_bold_indicator=[],
            global_parent_x_coords=[],
            global_parent_y_coords=[],
            global_parent_widths=[],
            global_parent_heights=[],
            global_parent_font_ids=[],
            global_parent_heading_indicator=[],
            global_parent_leaf_indicator=[],
            global_parent_bold_indicator=[])
        self.assertEqual(expected, etc_features)
示例#3
0
    def test_etc_features_with_long_overflow(self):
        text = 'Star Wars and not Trek ' + ' '.join(['star'] * 12)
        vdom = [
            lib.VdomElement(id=0,
                            text='Star Wars and not Trek',
                            features=lib.VdomFeatures(x_coord=44.0,
                                                      width=728.0,
                                                      y_coord=78.0,
                                                      height=45.0,
                                                      is_block=True,
                                                      is_inline=False,
                                                      is_heading=True,
                                                      is_leaf=False,
                                                      font_size=20,
                                                      is_bold=False),
                            parent_features=lib.VdomFeatures(x_coord=44.0,
                                                             width=728.0,
                                                             y_coord=78.0,
                                                             height=45.0,
                                                             is_block=True,
                                                             is_inline=False,
                                                             is_heading=True,
                                                             is_leaf=False,
                                                             font_size=20,
                                                             is_bold=False),
                            start_idx=0,
                            end_idx=5),
            lib.VdomElement(id=0,
                            text=' '.join(['star'] * 99),
                            features=lib.VdomFeatures(x_coord=44.0,
                                                      width=728.0,
                                                      y_coord=78.0,
                                                      height=45.0,
                                                      is_block=True,
                                                      is_inline=False,
                                                      is_heading=True,
                                                      is_leaf=False,
                                                      font_size=20,
                                                      is_bold=False),
                            parent_features=lib.VdomFeatures(x_coord=44.0,
                                                             width=728.0,
                                                             y_coord=78.0,
                                                             height=45.0,
                                                             is_block=True,
                                                             is_inline=False,
                                                             is_heading=True,
                                                             is_leaf=False,
                                                             font_size=20,
                                                             is_bold=False),
                            start_idx=5,
                            end_idx=17)
        ]
        example = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text=text,
            vdom=vdom,
            key_phrases=[
                lib.KeyPhrase(['Star', 'Wars']),
                lib.KeyPhrase(['Trek']),
            ])
        bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                       VOCAB_PATH)
        config = lib.EtcFeaturizationConfig(long_max_length=16,
                                            global_max_length=4,
                                            url_max_code_points=80,
                                            bert_vocab_path=bert_vocab_path,
                                            do_lower_case=True)
        tokenizer = tokenization.FullTokenizer(
            config.bert_vocab_path, do_lower_case=config.do_lower_case)
        etc_features = example.to_etc_features(tokenizer, config)
        expected = lib.OpenKpEtcFeatures(
            url_code_points=[
                104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
                108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97,
                116, 99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115,
                116, 97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115,
                99, 111, 118, 101, 114, 121, 45, 115, 101, 97, 115, 111, 110,
                45, 49, 46, 104, 116, 109, 108, -1, -1, -1, -1, -1, -1, -1
            ],
            label_start_idx=[0, 4, -1],
            label_phrase_len=[2, 1, -1],
            long_token_ids=[3, 14, 11, 15, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_word_idx=[0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_vdom_idx=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_input_mask=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_word_input_mask=[
                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            ],
            long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE3,
            global_token_ids=[1, 1, 1, 1],
            global_input_mask=[1, 0, 0, 0],
            global_x_coords=[44.0, 0, 0, 0],
            global_y_coords=[78.0, 0, 0, 0],
            global_widths=[728.0, 0, 0, 0],
            global_heights=[45.0, 0, 0, 0],
            global_font_ids=[13, 0, 0, 0],
            global_block_indicator=[1, 0, 0, 0],
            global_inline_indicator=[0, 0, 0, 0],
            global_heading_indicator=[1, 0, 0, 0],
            global_leaf_indicator=[0, 0, 0, 0],
            global_bold_indicator=[0, 0, 0, 0],
            global_parent_x_coords=[44.0, 0, 0, 0],
            global_parent_y_coords=[78.0, 0, 0, 0],
            global_parent_widths=[728.0, 0, 0, 0],
            global_parent_heights=[45.0, 0, 0, 0],
            global_parent_font_ids=[13, 0, 0, 0],
            global_parent_heading_indicator=[1, 0, 0, 0],
            global_parent_leaf_indicator=[0, 0, 0, 0],
            global_parent_bold_indicator=[0, 0, 0, 0])

        self.assertEqual(expected, etc_features)
示例#4
0
    def test_etc_features_with_vdom_overflow(self):
        vdom = [
            lib.VdomElement(id=0,
                            text='Star Trek Discovery Season 1 Jason',
                            features=lib.VdomFeatures(x_coord=44.0,
                                                      width=728.0,
                                                      y_coord=78.0,
                                                      height=45.0,
                                                      is_block=True,
                                                      is_inline=False,
                                                      is_heading=True,
                                                      is_leaf=False,
                                                      font_size=20,
                                                      is_bold=False),
                            parent_features=lib.VdomFeatures(x_coord=44.0,
                                                             width=728.0,
                                                             y_coord=78.0,
                                                             height=45.0,
                                                             is_block=True,
                                                             is_inline=False,
                                                             is_heading=True,
                                                             is_leaf=False,
                                                             font_size=20,
                                                             is_bold=False),
                            start_idx=0,
                            end_idx=5),
            lib.VdomElement(id=0,
                            text='Isaacs Jason Isaacs and Doug',
                            features=lib.VdomFeatures(x_coord=208.0,
                                                      width=49.0,
                                                      y_coord=138.0,
                                                      height=15.0,
                                                      is_block=False,
                                                      is_inline=False,
                                                      is_heading=False,
                                                      is_leaf=False,
                                                      font_size=12,
                                                      is_bold=True),
                            parent_features=lib.VdomFeatures(x_coord=198.0,
                                                             width=564.0,
                                                             y_coord=138.0,
                                                             height=15.0,
                                                             is_block=True,
                                                             is_inline=False,
                                                             is_heading=False,
                                                             is_leaf=False,
                                                             font_size=12,
                                                             is_bold=True),
                            start_idx=5,
                            end_idx=8)
        ]

        text = 'Star Trek Discovery Season 1 Director Jason Isaacs'
        text += ' foo' * (20 - 8)
        vdom.extend([
            lib.VdomElement(id=0,
                            text='foo',
                            features=lib.VdomFeatures(x_coord=208.0,
                                                      width=49.0,
                                                      y_coord=138.0,
                                                      height=15.0,
                                                      is_block=False,
                                                      is_inline=False,
                                                      is_heading=False,
                                                      is_leaf=True,
                                                      font_size=12,
                                                      is_bold=True),
                            parent_features=lib.VdomFeatures(x_coord=3110.0,
                                                             width=92.0,
                                                             y_coord=123.0,
                                                             height=75.0,
                                                             is_block=True,
                                                             is_inline=False,
                                                             is_heading=False,
                                                             is_leaf=True,
                                                             font_size=13,
                                                             is_bold=True),
                            start_idx=start_idx,
                            end_idx=start_idx + 1)
            for start_idx in range(8, 20)
        ])
        example = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text=text,
            vdom=vdom,
            key_phrases=[
                lib.KeyPhrase(['Star', 'Trek']),
                lib.KeyPhrase(['Jason', 'Isaacs']),
            ])
        bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                       VOCAB_PATH)
        config = lib.EtcFeaturizationConfig(long_max_length=16,
                                            global_max_length=4,
                                            url_max_code_points=80,
                                            bert_vocab_path=bert_vocab_path,
                                            do_lower_case=True)
        tokenizer = tokenization.FullTokenizer(
            config.bert_vocab_path, do_lower_case=config.do_lower_case)
        etc_features = example.to_etc_features(tokenizer, config)
        expected = lib.OpenKpEtcFeatures(
            url_code_points=[
                104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
                108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97,
                116, 99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115,
                116, 97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115,
                99, 111, 118, 101, 114, 121, 45, 115, 101, 97, 115, 111, 110,
                45, 49, 46, 104, 116, 109, 108, -1, -1, -1, -1, -1, -1, -1
            ],
            label_start_idx=[0, 7, -1],
            label_phrase_len=[2, 2, -1],
            long_token_ids=[
                3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 13, 13, 0
            ],
            long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 0],
            long_vdom_idx=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 0],
            long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
            long_word_input_mask=[
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
            ],
            long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE2,
            global_token_ids=[1, 1, 1, 1],
            global_input_mask=[1, 1, 1, 1],
            global_x_coords=[44.0, 208.0, 208.0, 208.0],
            global_y_coords=[78.0, 138.0, 138.0, 138.0],
            global_widths=[728.0, 49.0, 49.0, 49.0],
            global_heights=[45.0, 15.0, 15.0, 15.0],
            global_font_ids=[13, 5, 5, 5],
            global_block_indicator=[1, 0, 0, 0],
            global_inline_indicator=[0, 0, 0, 0],
            global_heading_indicator=[1, 0, 0, 0],
            global_leaf_indicator=[0, 0, 1, 1],
            global_bold_indicator=[0, 1, 1, 1],
            global_parent_x_coords=[44.0, 198.0, 3110.0, 3110.0],
            global_parent_y_coords=[78.0, 138.0, 123.0, 123.0],
            global_parent_widths=[728.0, 564.0, 92.0, 92.0],
            global_parent_heights=[45.0, 15.0, 75.0, 75.0],
            global_parent_font_ids=[13, 5, 6, 6],
            global_parent_heading_indicator=[1, 0, 0, 0],
            global_parent_leaf_indicator=[0, 0, 1, 1],
            global_parent_bold_indicator=[0, 1, 1, 1])

        self.assertEqual(expected, etc_features)
示例#5
0
    def test_open_kp_example_from_json_unlabeled(self):
        example_json = r"""
    {
      "url": "http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html",
      "text": "Star Trek Discovery Season 1 Director",
      "VDOM": "[{\"Id\":0,\"text\":\"Star Trek Discovery Season 1\",\"feature\":[44.0,728.0,78.0,45.0,1.0,0.0,1.0,0.0,20.0,0.0,44.0,728.0,78.0,45.0,1.0,0.0,1.0,0.0,20.0,0.0],\"start_idx\":0,\"end_idx\":5},{\"Id\":0,\"text\":\"Director\",\"feature\":[208.0,49.0,138.0,15.0,0.0,0.0,0.0,0.0,12.0,1.0,198.0,564.0,138.0,15.0,1.0,0.0,0.0,0.0,12.0,1.0],\"start_idx\":5,\"end_idx\":6}]"
    }
    """
        example = lib.OpenKpExample.from_json(example_json)

        expected = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text='Star Trek Discovery Season 1 Director',
            vdom=[
                lib.VdomElement(id=0,
                                text='Star Trek Discovery Season 1',
                                features=lib.VdomFeatures(x_coord=44.0,
                                                          width=728.0,
                                                          y_coord=78.0,
                                                          height=45.0,
                                                          is_block=True,
                                                          is_inline=False,
                                                          is_heading=True,
                                                          is_leaf=False,
                                                          font_size=20,
                                                          is_bold=False),
                                parent_features=lib.VdomFeatures(
                                    x_coord=44.0,
                                    width=728.0,
                                    y_coord=78.0,
                                    height=45.0,
                                    is_block=True,
                                    is_inline=False,
                                    is_heading=True,
                                    is_leaf=False,
                                    font_size=20,
                                    is_bold=False),
                                start_idx=0,
                                end_idx=5),
                lib.VdomElement(id=0,
                                text='Director',
                                features=lib.VdomFeatures(x_coord=208.0,
                                                          width=49.0,
                                                          y_coord=138.0,
                                                          height=15.0,
                                                          is_block=False,
                                                          is_inline=False,
                                                          is_heading=False,
                                                          is_leaf=False,
                                                          font_size=12,
                                                          is_bold=True),
                                parent_features=lib.VdomFeatures(
                                    x_coord=198.0,
                                    width=564.0,
                                    y_coord=138.0,
                                    height=15.0,
                                    is_block=True,
                                    is_inline=False,
                                    is_heading=False,
                                    is_leaf=False,
                                    font_size=12,
                                    is_bold=True),
                                start_idx=5,
                                end_idx=6)
            ],
            key_phrases=None)

        self.assertEqual(expected, example)