def test_header_parser():
    """Parse the unit-test XML file and check the single header it contains."""
    settings = get_config_default()
    parsed_headers = parse_xml_header(path=settings["xml_unittest_file"])

    # The fixture is expected to hold exactly one case.
    assert len(parsed_headers) == 1

    case_header = parsed_headers['CA-aix-en-provence-20130208-1022871-jurica']
    assert case_header['defendeur_fullname'] == ['Catherine ***REMOVED***']
def test_match_headers_content():
    """Check that a matcher built from the first parsed header finds the expected PERS span."""
    settings = get_config_default()
    all_headers = parse_xml_header(path=settings["xml_unittest_file"])

    # Only the first case of the fixture is used to build the matcher.
    case_ids = list(all_headers.keys())
    first_header = all_headers[case_ids[0]]

    matcher_factory = MatchValuesFromHeaders(current_header=first_header,
                                             threshold_size=3)
    partie_pp_matcher = matcher_factory.get_matcher_of_partie_pp_from_headers()

    sentence = "C'est Catherine ***REMOVED*** qui est responsable de ces faits avec M. LEON ***REMOVED***"
    expected_matches = [(6, 29, "PERS")]
    assert partie_pp_matcher.get_matches(sentence, "PERS") == expected_matches
def __init__(self):
    """
    Load the French court names from the configured file and build a
    case-insensitive matcher out of them.

    The names come from an extraction of the open data set published at
    https://www.data.gouv.fr/fr/datasets/les-statistiques-par-juridiction/#_
    (the published list has more data than the stored extraction).
    """
    names_path = get_config_default()["french_court_names"]
    with open(names_path) as names_file:
        stripped_lines = [raw_line.strip() for raw_line in names_file.readlines()]

    # NOTE(review): court_names is not assigned in this method; presumably it
    # is a set declared on the class - verify.
    for court_name in stripped_lines:
        # blank lines are skipped
        if court_name:
            self.court_names.add(court_name)

    # Sanity check on the size of the loaded list.
    assert len(self.court_names) > 1000
    self.matcher = AcoraMatcher(content=list(self.court_names),
                                ignore_case=True)
def __init__(self):
    """
    Build a case-insensitive matcher of postal code / city combinations.

    Each line of the configured "postal_code_city" file is split on ";":
    the second field is read as the city and the third as the postal code.
    For every city of at least 3 characters, two spellings are registered:
    "<postal code> <city>" and "<city> (<postal code>)".
    """
    config = get_config_default()
    entries = []
    with open(config["postal_code_city"]) as source:
        for row in source.readlines():
            columns = row.split(";")
            city = columns[1].strip()
            if len(city) < 3:
                continue
            postal_code = columns[2].strip()
            entries.extend((postal_code + " " + city,
                            city + " (" + postal_code + ")"))

    # Sanity check on the size of the loaded list.
    assert len(entries) > 1000

    # NOTE(review): presumably drops the entry generated from a header row -
    # verify. Only the first of the two entries built from that row is
    # removed; the "<city> (<postal code>)" one stays in the list.
    del entries[0]

    self.matcher = AcoraMatcher(list(entries), ignore_case=True)
def __init__(self, ignore_case: bool):
    """
    Build a matcher of first names based on two French first-name dictionaries.

    Names shorter than 4 characters are ignored. Entries from the second
    dictionary are passed through get_title_case before being stored.

    :param ignore_case: True to ignore case during matching
    """
    config = get_config_default()
    file1 = config["first_name_dict_1"]
    file2 = config["first_name_dict_2"]

    first_names = set()

    with open(file1) as f1:
        for line in f1:
            fields = line.split(";")
            # all names start with a Upcase letter and finishes with a space
            text = fields[3].strip()
            if len(text) >= 4:
                first_names.add(text)

    with open(file2, encoding="ISO-8859-1") as f2:
        for line in f2:
            fields = line.split(";")
            text = fields[0].strip()
            if len(text) >= 4:
                first_names.add(get_title_case(text))

    # Entries excluded from the dictionary (presumably because they collide
    # with common words or place names - verify).
    to_remove = {
        "Elle", "France", "Mercedes", "Paris", "Alger", "Oran", "Sans"
    }
    # difference_update tolerates entries that are absent from the source
    # files, whereas set.remove would raise KeyError and abort construction.
    first_names.difference_update(to_remove)

    self.first_name_dict = first_names
    self.matcher = AcoraMatcher(content=list(self.first_name_dict),
                                ignore_case=ignore_case)
from match_text_unsafe.extend_names import ExtendNames
from match_text_unsafe.find_header_values import parse_xml_headers
from match_text_unsafe.postal_code_dictionary_matcher import PostalCodeCity
from misc.normalize_offset import (
    normalize_offsets,
    remove_spaces_included_in_offsets,
    clean_offsets_from_unwanted_words,
)
from modify_text.change_case import random_case_change
from modify_text.modify_strings import remove_key_words
from ner.training_function import train_model
from resources.config_provider import get_config_default
from viewer.spacy_viewer import convert_offsets_to_spacy_docs, view_spacy_docs
from xml_extractions.extract_node_values import get_paragraph_from_file, Paragraph, Offset

# Fixed seed so that successive runs are reproducible.
seed(123)

# Training parameters, all read from the default configuration.
config_training = get_config_default()

# Paths
xml_train_path = config_training["xml_train_path"]
model_dir_path = config_training["model_dir_path"]

# Numeric training parameters (converted from their configuration values)
n_iter = int(config_training["number_iterations"])
batch_size = int(config_training["batch_size"])
dropout_rate = float(config_training["dropout_rate"])

training_set_export_path = config_training["training_set"]

change_case_rate = int(config_training["change_case_rate"])
remove_keyword_rate = int(config_training["remove_keyword_rate"])
frequent_entity_threshold = int(config_training["frequent_entity_threshold"])
number_of_paragraph_to_display = int(config_training["number_of_paragraph_to_display"])

# The script accepts at most one command line argument.
print(len(sys.argv))
assert len(sys.argv) <= 2