Exemplo n.º 1
0
    def test_iterate_alt_tag(self):
        """Test `_iterate_alt_tag` method outputs for an example tag in both
        scenarios.

        The example ALT tag holds two alternatives separated by ``|``. The
        assertions expect the returned text to keep both alternatives (and
        the separator) and the entity offsets to index into that combined
        text.
        """
        alt_tag_text = (
            '<ALT><EM ID="142" CATEG="ACONTECIMENTO" TIPO="EVENTO">'
            'Ovarense-Amora</EM>|'
            '<EM ID="143" CATEG="PESSOA" TIPO="GRUPOMEMBRO">Ovarense</EM>'
            '-<EM ID="144" CATEG="PESSOA" TIPO="GRUPOMEMBRO">Amora</EM></ALT>')
        alt_tag = create_tag(alt_tag_text)

        # Expected values live outside the subtests: a failing assertion
        # inside a subTest skips the rest of that block, so anything defined
        # there would be unbound (NameError) when the next subTest runs.
        expected_text = "Ovarense-Amora|Ovarense-Amora"
        entities_in_alt_tag = [{
            'entity_id': '142',
            'text': 'Ovarense-Amora',
            'start_offset': 0,
            'end_offset': len('Ovarense-Amora'),
            'label': 'ACONTECIMENTO'
        }, {
            'entity_id': '143',
            'text': 'Ovarense',
            'start_offset': len('Ovarense-Amora|'),
            'end_offset': len('Ovarense-Amora|Ovarense'),
            'label': 'PESSOA'
        }, {
            'entity_id': '144',
            'text': 'Amora',
            'start_offset': len('Ovarense-Amora|Ovarense-'),
            'end_offset': len('Ovarense-Amora|Ovarense-Amora'),
            'label': 'PESSOA'
        }]

        with self.subTest("Test _iterate_alt_tag in Total scenario"):
            text, entities = XMLtoJSON(
                selective=False)._iterate_alt_tag(alt_tag)

            self.assertEqual(text, expected_text)
            self.assertCountEqual(entities, entities_in_alt_tag)

        with self.subTest("Test _iterate_alt_tag in Selective scenario"):
            text, entities = XMLtoJSON(
                selective=True)._iterate_alt_tag(alt_tag)

            self.assertEqual(text, expected_text)
            # Only the two 'PESSOA' entities are expected here.
            self.assertCountEqual(
                entities, entities_in_alt_tag[1:],
                "Selective scenario should ignore the first 'ACONTECIMENTO' "
                "entity.")
Exemplo n.º 2
0
    def test_get_label_vague_single_label_total_only(self, total_only_label_1,
                                                     total_only_label_2):
        """Check `_get_label` on an EM tag whose CATEG holds two labels joined
        by ``|``.

        The selective converter is expected to return ``None`` and the total
        converter is expected to return the first label. The parameter names
        suggest both labels exist only in the total scenario; they are
        presumably supplied by a parametrizing decorator outside this block.
        """
        em_tag = create_tag(
            f'<EM ID="383" CATEG="{total_only_label_1}|{total_only_label_2}"></EM>'
        )

        selective_result = XMLtoJSON(selective=True)._get_label(em_tag)
        self.assertIsNone(selective_result,
                          "Selective scenario should ignore the label.")

        total_result = XMLtoJSON(selective=False)._get_label(em_tag)
        self.assertEqual(total_result, total_only_label_1,
                         "Total scenario should read the first label.")
Exemplo n.º 3
0
    def test_get_label_selective_scenario(self, input_label):
        """Check that `_get_label` returns the tag's single CATEG label
        unchanged for both the selective and the total converter.

        `input_label` is presumably a selective-scenario label supplied by a
        parametrizing decorator outside this block.
        """
        em_tag = create_tag(f'<EM ID="383" CATEG="{input_label}"></EM>')

        selective_result = XMLtoJSON(selective=True)._get_label(em_tag)
        self.assertEqual(selective_result, input_label,
                         "Selective scenario should read the label.")

        total_result = XMLtoJSON(selective=False)._get_label(em_tag)
        self.assertEqual(total_result, input_label,
                         "Total scenario should read the label.")
Exemplo n.º 4
0
    def test_get_label_vague_entity_valid_label(self, total_only_label,
                                                selective_label):
        """Check `_get_label` on a vague entity carrying two labels.

        The CATEG attribute lists a total-scenario label first and a
        selective-scenario label second. The selective converter must return
        the second label, while the total converter must return the first.
        """
        em_tag = create_tag(
            f'<EM ID="383" CATEG="{total_only_label}|{selective_label}"></EM>')

        picked_selective = XMLtoJSON(selective=True)._get_label(em_tag)
        self.assertEqual(picked_selective, selective_label,
                         "Selective scenario should ignore first label.")

        picked_total = XMLtoJSON(selective=False)._get_label(em_tag)
        self.assertEqual(picked_total, total_only_label,
                         "Total scenario should always return first label.")
Exemplo n.º 5
0
    def test_handle_alt_simple_case(self):
        """Exercise `_handle_alt` with both strategies on an ALT tag in which
        only the second alternative contains an entity.

        In the total scenario the single 'ABSTRACCAO' entity is expected; in
        the selective scenario no entity is expected. The chosen text is the
        same in every case.
        """
        parsed_alt = create_tag(
            '<ALT>Nomes de Origem|<EM ID="2011" CATEG="ABSTRACCAO" TIPO="NOME">Nomes de Origem</EM></ALT>'
        )

        total_expected = [{
            'start_offset': 0,
            'end_offset': len('Nomes de Origem'),
            'entity_id': '2011',
            'label': 'ABSTRACCAO',
            'text': 'Nomes de Origem',
        }]

        # (scenario name, selective flag, expected entities, failure message)
        scenarios = (
            ("Total", False, total_expected, None),
            ("Selective", True, [],
             "Selective scenario should not select any entity"),
        )

        for scenario_name, is_selective, expected, failure_msg in scenarios:
            for alt_strat in ['most_entities', 'entity_coverage']:
                with self.subTest(
                        msg=f"Test {scenario_name} scenario with strategy {alt_strat}"
                ):
                    converter = XMLtoJSON(selective=is_selective,
                                          alt_strategy=alt_strat)

                    chosen_text, chosen_entities = converter._handle_alt(
                        parsed_alt)
                    self.assertEqual(chosen_text, "Nomes de Origem")
                    self.assertListEqual(chosen_entities, expected,
                                         failure_msg)
Exemplo n.º 6
0
    def test_text_agglutination_correction(self):
        """Test a scenario where an EM tag text would agglutinate with its
        preceding text in original HAREM, but not in generated JSON.

        The excerpt has no whitespace between the word "aos" and the
        following <EM> tag; the converted text is expected to read
        "aos 67'" and every entity's offsets must slice its own text out of
        the converted document text.
        """
        doc_excerpt = (
            '<DOC DOCID="part of HAREM-273-02298">'
            '<EM ID="203" CATEG="PESSOA" TIPO="INDIVIDUAL">Marco Bode</EM> fez '
            'o <EM ID="204" CATEG="VALOR" TIPO="CLASSIFICACAO">4-0</EM> aos'
            # Lack of space between these two lines
            '<EM ID="205" CATEG="VALOR" TIPO="QUANTIDADE">67\'</EM>, o '
            '<EM ID="206" CATEG="PESSOA" TIPO="GRUPOMEMBRO">Duisburg</EM> '
            'reduziu por <EM ID="207" CATEG="PESSOA" TIPO="INDIVIDUAL">'
            'Markkus Marin</EM> (<EM ID="208" CATEG="VALOR" TIPO="QUANTIDADE">'
            '78\'</EM>) e foi <EM ID="209" CATEG="PESSOA" TIPO="INDIVIDUAL">'
            'Andreas Herzog</EM> quem estabeleceu o resultado final, a sete '
            'minutos do fim.</DOC>')
        doc_tag = etree.fromstring(doc_excerpt)
        doc_dict = XMLtoJSON(selective=False).convert_document(doc_tag)

        doc_text = (
            "Marco Bode fez o 4-0 aos 67', o Duisburg reduziu por "
            "Markkus Marin (78') e foi Andreas Herzog quem estabeleceu o "
            "resultado final, a sete minutos do fim.")

        _Entity = namedtuple('_Entity', ['label', 'text'])
        expected_entities = [
            _Entity('PESSOA', 'Marco Bode'),
            _Entity('VALOR', '4-0'),
            _Entity('VALOR', "67'"),
            _Entity('PESSOA', 'Duisburg'),
            _Entity('PESSOA', 'Markkus Marin'),
            _Entity('VALOR', "78'"),
            _Entity('PESSOA', 'Andreas Herzog'),
        ]

        self.assertEqual(
            doc_dict['doc_text'], doc_text,
            "Doc text should not agglutinate words aos67' due to lack of "
            "space before <EM> tag.")

        # zip() stops at the shorter sequence, so without this check the loop
        # below would pass vacuously when entities are missing.
        self.assertEqual(
            len(doc_dict['entities']), len(expected_entities),
            "Converted document should contain exactly the expected entities")

        for entity, expected_entity in zip(doc_dict['entities'],
                                           expected_entities):
            start, end = entity['start_offset'], entity['end_offset']
            self.assertEqual(
                entity['text'], doc_text[start:end],
                "Entity text should be equal to text slice using offsets")

            self.assertEqual(entity['text'], expected_entity.text,
                             "Entity texts should be equal to expected texts")

            self.assertEqual(entity['label'], expected_entity.label)
Exemplo n.º 7
0
    def test_convert_entity(self, entity_id, label, entity_text):
        """Check that `_convert_entity` turns an <EM/> tag into the expected
        dictionary.

        The expected text is the input text with leading whitespace removed
        and HTML entities unescaped; offsets are expected to start at 0 and
        end at the length of that processed text.
        """
        id_text = str(entity_id)
        em_tag = create_tag(
            f'<EM ID="{id_text}" CATEG="{label}">{entity_text}</EM>')

        converted = XMLtoJSON(selective=False)._convert_entity(em_tag)

        expected_text = html.unescape(entity_text.lstrip())
        expected = {
            'entity_id': id_text,
            'text': expected_text,
            'label': label,
            'start_offset': 0,
            'end_offset': len(expected_text),
        }
        self.assertDictEqual(converted, expected)
Exemplo n.º 8
0
    def test_complete_doc_conversion(self):
        """Convert a small <DOC> sample in the total scenario and check the
        resulting document id, text and entities.

        The document text is expected to be complete and unescaped
        (``&amp;`` becomes ``&``), and each entity's offsets must slice its
        own text out of the document text.
        """
        doc_sample = (
            '<DOC DOCID="HAREM-554-05073">\n'
            'MONEY 1\n'
            'O escritor <EM ID="972" CATEG="PESSOA" TIPO="INDIVIDUAL">Clive Cussler</EM>, '
            'autor das aventuras de <EM ID="973" CATEG="PESSOA" TIPO="INDIVIDUAL">Dirk Pitt</EM>, '
            'assinou um contrato de <EM ID="974" CATEG="VALOR" TIPO="MOEDA">US$ 14 milhões</EM> '
            'com a <EM ID="975" CATEG="ORGANIZACAO" TIPO="EMPRESA">Simon &amp; Schuster</EM> '
            'para a publicação de dois livros.</DOC>')
        doc_tag = create_tag(doc_sample)

        doc_text = (
            '\nMONEY 1\nO escritor Clive Cussler, autor das aventuras de Dirk Pitt, '
            'assinou um contrato de US$ 14 milhões com a Simon & Schuster para a publicação '
            'de dois livros.')

        _Entity = namedtuple('_Entity', ['label', 'text'])
        expected_entities = [
            _Entity('PESSOA', 'Clive Cussler'),
            _Entity('PESSOA', 'Dirk Pitt'),
            _Entity('VALOR', 'US$ 14 milhões'),
            _Entity('ORGANIZACAO', 'Simon & Schuster'),
        ]

        doc_dict = XMLtoJSON(selective=False).convert_document(doc_tag)

        self.assertEqual(doc_dict['doc_id'], doc_tag.attrib['DOCID'])
        self.assertEqual(doc_dict['doc_text'], doc_text,
                         "Document text should be complete and unescaped.")

        # zip() stops at the shorter sequence, so without this check the loop
        # below would pass vacuously when entities are missing.
        self.assertEqual(
            len(doc_dict['entities']), len(expected_entities),
            "Converted document should contain exactly the expected entities")

        with self.subTest('Test entities start and end offsets match the'
                          ' entity text.'):
            for entity, expected_entity in zip(doc_dict['entities'],
                                               expected_entities):
                start, end = entity['start_offset'], entity['end_offset']
                self.assertEqual(
                    doc_dict['doc_text'][start:end], entity['text'],
                    "Entity text should match doc_text slice using start and "
                    "end offsets")

                self.assertEqual(entity['text'], expected_entity.text)
                self.assertEqual(entity['label'], expected_entity.label)
Exemplo n.º 9
0
    def test_convertion_checks(self):
        """Convert the HAREM XML files and perform basic checks:
        1- All documents have texts.
        2- All returned entities have valid texts and offsets.

        HAREM files will be downloaded and saved in the cache directory.
        """
        for dataset in ('FirstHAREM', 'MiniHAREM'):
            xml_file = cached_download(download_urls[dataset],
                                       cache_dir=HAREM_DATA_DIR)
            # Register the close immediately so the file handle is released
            # even when one of the assertions below fails.
            self.addCleanup(xml_file.close)

            expected_doc_count = 129 if dataset == 'FirstHAREM' else 128

            for scenario in ('selective', 'total'):
                # The same file object is parsed once per scenario, so rewind
                # it before each conversion.
                xml_file.seek(0)
                converted = XMLtoJSON.convert_xml(
                    xml_file, selective=scenario == 'selective')

                self.assertEqual(len(converted), expected_doc_count,
                                 "Assert converted document count is right.")

                for doc in converted:
                    doc_text = doc['doc_text']

                    self.assertGreater(len(doc_text), 0,
                                       'Text should not be empty')

                    for entity in doc['entities']:
                        start, end = entity['start_offset'], entity[
                            'end_offset']
                        # Offsets must describe a non-empty span that lies
                        # inside the document text.
                        self.assertTrue(0 <= start < end)
                        self.assertTrue(0 < end <= len(doc_text))
                        self.assertEqual(doc_text[start:end], entity['text'])
Exemplo n.º 10
0
    def test_handle_alt_method(self):
        """Exercise `_handle_alt` on a real ALT tag and verify that the text
        and entities it returns follow the alt_strategy and the scenario.

        The first alternative holds one 'ACONTECIMENTO' entity spanning the
        whole text; the second holds two 'PESSOA' entities. Offsets are
        relative to the chosen alternative's own text.
        """
        parsed_alt = create_tag(
            '<ALT><EM ID="142" CATEG="ACONTECIMENTO" TIPO="EVENTO">'
            'Ovarense-Amora</EM>|'
            '<EM ID="143" CATEG="PESSOA" TIPO="GRUPOMEMBRO">Ovarense</EM>'
            '-<EM ID="144" CATEG="PESSOA" TIPO="GRUPOMEMBRO">Amora</EM></ALT>')

        event_alternative = [
            {
                'entity_id': '142',
                'text': 'Ovarense-Amora',
                'start_offset': 0,
                'end_offset': len('Ovarense-Amora'),
                'label': 'ACONTECIMENTO',
            },
        ]

        teams_alternative = [
            {
                'entity_id': '143',
                'text': 'Ovarense',
                'start_offset': 0,
                'end_offset': len('Ovarense'),
                'label': 'PESSOA',
            },
            {
                'entity_id': '144',
                'text': 'Amora',
                'start_offset': len('Ovarense-'),
                'end_offset': len('Ovarense-Amora'),
                'label': 'PESSOA',
            },
        ]

        # (scenario name, selective flag, strategy, expected entities).
        # For selective scenario, only the second alternative has entities,
        # so it is always selected.
        cases = (
            ("Total", False, 'most_entities', teams_alternative),
            ("Total", False, 'entity_coverage', event_alternative),
            ("Selective", True, 'most_entities', teams_alternative),
            ("Selective", True, 'entity_coverage', teams_alternative),
        )

        for scenario_name, is_selective, strategy, expected in cases:
            with self.subTest(f"Test _handle_alt in {scenario_name} scenario "
                              f"with {strategy} strategy"):
                converter = XMLtoJSON(selective=is_selective,
                                      alt_strategy=strategy)
                chosen_text, chosen_entities = converter._handle_alt(
                    parsed_alt)

                self.assertEqual(chosen_text, "Ovarense-Amora")
                self.assertCountEqual(chosen_entities, expected)