Exemplo n.º 1
0
def test_header_parser():
    """
    Check the header parsing of the unit test XML file

    The file is expected to produce exactly one case, and the
    'defendeur_fullname' entry of that case must contain the expected name.
    """
    unittest_xml = get_config_default()["xml_unittest_file"]
    parsed_headers = parse_xml_header(path=unittest_xml)
    assert len(parsed_headers) == 1
    expected_case_id = 'CA-aix-en-provence-20130208-1022871-jurica'
    defendant_names = parsed_headers[expected_case_id]['defendeur_fullname']
    assert defendant_names == ['Catherine ***REMOVED***']
Exemplo n.º 2
0
def test_match_headers_content():
    """
    Check that a matcher built from a case header finds a header name in a text

    The header of the first case returned by the parser is used to build the
    "partie_pp" matcher, which is then applied to a sample sentence.
    """
    unittest_xml = get_config_default()["xml_unittest_file"]
    headers_by_case = parse_xml_header(path=unittest_xml)
    # only the first parsed case is used
    first_case_id = list(headers_by_case)[0]
    matcher_builder = MatchValuesFromHeaders(current_header=headers_by_case[first_case_id], threshold_size=3)
    partie_pp_matcher = matcher_builder.get_matcher_of_partie_pp_from_headers()

    sentence = "C'est Catherine ***REMOVED*** qui est responsable de ces faits avec M. LEON ***REMOVED***"

    found_offsets = partie_pp_matcher.get_matches(sentence, "PERS")
    assert found_offsets == [Offset(6, 29, "PERS")]
    def __init__(self):
        """
        Build a case insensitive matcher of French court names

        Names are read, one per line, from the file configured under
        "french_court_names". The list comes from an open data set:
        https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_
        (the data set contains more data, the stored file is an extraction)
        """
        names_path = get_config_default()["french_court_names"]

        # NOTE(review): self.court_names is not created in this method, it has
        # to exist beforehand (presumably a class level set, which would then
        # be shared between instances) - to be confirmed against the class.
        with open(names_path) as names_file:
            stripped_lines = (raw_line.strip() for raw_line in names_file)
            # blank lines are ignored
            self.court_names.update(name for name in stripped_lines
                                    if len(name) > 0)
        # sanity check: more than 1000 court names are expected
        assert len(self.court_names) > 1000
        self.matcher = AcoraMatcher(content=list(self.court_names),
                                    ignore_case=True)
    def __init__(self):
        """
        Build a case insensitive matcher of French postal code / city pairs

        The file configured under "postal_code_city" is read line by line,
        each line being split on ";" (city in the 2nd field, postal code in
        the 3rd one). Cities shorter than 3 characters are skipped.
        Two patterns are generated per city:
        "<postal code> <city>" and "<city> (<postal code>)"
        """
        source_path = get_config_default()["postal_code_city"]
        patterns = []

        with open(source_path) as source_file:
            for row in source_file:
                columns = row.split(";")
                city_name = columns[1].strip()
                if len(city_name) < 3:
                    continue
                zip_code = columns[2].strip()
                patterns.append("{} {}".format(zip_code, city_name))
                patterns.append("{} ({})".format(city_name, zip_code))
        # sanity check: more than 1000 patterns are expected
        assert len(patterns) > 1000
        # NOTE(review): the first pattern is dropped, presumably because it
        # comes from a header line. If so, the second pattern generated from
        # that same line is still kept - to be confirmed.
        del patterns[0]
        self.matcher = AcoraMatcher(list(patterns),
                                    ignore_case=True)
Exemplo n.º 5
0
    def __init__(self, ignore_case: bool):
        """
        Build a matcher of first names based on two French first name dictionaries

        Both dictionaries are read line by line and split on ";".
        Names shorter than 4 characters are ignored.
        The resulting set is stored in self.first_name_dict and used to build
        the Acora matcher stored in self.matcher.
        :param ignore_case: True to ignore case during matching
        """
        config = get_config_default()

        file1 = config["first_name_dict_1"]
        file2 = config["first_name_dict_2"]

        first_names = set()
        # first dictionary: the name is in the 4th field and used as is
        # (per the original note, names start with an upper case letter and
        # finish with a space, hence the strip)
        with open(file1) as f1:
            for line in f1:
                text = line.split(";")[3].strip()
                if len(text) >= 4:
                    first_names.add(text)

        # second dictionary: the name is in the 1st field and is converted
        # with get_title_case before being stored
        with open(file2, encoding="ISO-8859-1") as f2:
            for line in f2:
                text = line.split(";")[0].strip()
                if len(text) >= 4:
                    first_names.add(get_title_case(text))

        # Entries excluded from the dictionary (presumably because they are
        # mostly used as common words or place names - to be confirmed).
        to_remove = [
            "Elle", "France", "Mercedes", "Paris", "Alger", "Oran", "Sans"
        ]
        # difference_update ignores entries which are absent from the set,
        # where set.remove would raise a KeyError and abort the construction
        # as soon as one of these words is missing from the dictionaries.
        first_names.difference_update(to_remove)

        self.first_name_dict = first_names
        self.matcher = AcoraMatcher(content=list(self.first_name_dict),
                                    ignore_case=ignore_case)
Exemplo n.º 6
0
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.
import sys
sys.path.append('../xml_extractions')
sys.path.append('../resources')
from config_provider import get_config_default
from extract_node_values import get_paragraph_with_entities, read_xml, get_paragraph_from_file

# Default configuration, loaded once at import time
config_training = get_config_default()
# Path of the XML file read by the tests ("xml_unittest_file" configuration entry)
xml_path = config_training["xml_unittest_file"]


def test_xml_parser():
    tree = read_xml(xml_path)
    r = tree.xpath('//TexteJuri/P')

    assert len(r) == 27

    for i in r:
        paragraph_text, extracted_text, offsets = get_paragraph_with_entities(
            i)
        if len(extracted_text) > 0:
            item_text = extracted_text[0]
            current_attribute = offsets[0]