def _process_child(self, child):
    """Build a record for one child from its text fragment.

    :param child: text describing a single child.
    :return: dict with the keys ``name``, ``gender``, ``birthYear`` and
        ``kairaId``, or ``None`` when ``RegexNoneMatchException`` escapes the
        extraction (e.g. from the name search).
    """
    try:
        name_match = regex_utils.safe_search(self.NAME_PATTERN, child, self.CHILD_OPTIONS)
        # Trim whitespace, then dashes, then the spaces the dashes may have exposed.
        child_name = name_match.group("name").strip().strip("-").strip(" ")

        try:
            child_gender = Sex.find_sex(child_name)
        except SexException:
            self.metadata_collector.add_error_record('genderNotFound', 2)
            child_gender = None

        try:
            year_digits = regex_utils.safe_search(
                self.YEAR_PATTERN, child, self.CHILD_OPTIONS).group("year")
            # Values below 70 get the "19" prefix, everything else "18".
            century = "19" if float(year_digits) < 70 else "18"
            birth_year = text_utils.int_or_none(century + year_digits)
        except regex_utils.RegexNoneMatchException:
            birth_year = None

        return {
            "name": child_name,
            "gender": child_gender,
            "birthYear": birth_year,
            "kairaId": self._kaira_id_provider.get_new_id('C'),
        }
    except regex_utils.RegexNoneMatchException:
        # No usable name: this fragment yields no child record.
        return None
def _find_profession(self, text, start_position):
    """Search for a profession in a window of ``text``.

    The window is cut from ``text`` with ``start_position`` and
    ``self.SEARCH_SPACE``; if the keyword ``Puol`` is found in it, the window
    is further truncated to end where that keyword starts.

    :return: tuple ``(profession, cursor_location)``. ``profession`` is the
        result of ``self._clean_professions``; ``cursor_location`` is the end
        of the profession match inside the window, or 0 when nothing matched.
    """
    window = text_utils.take_sub_str_based_on_range(text, start_position, self.SEARCH_SPACE)
    cursor_location = 0
    raw_profession = None

    # limit the search range if there is spouse keyword:
    try:
        spouse_keyword = regexUtils.safe_search(r"Puol", window, self.PROFESSION_OPTIONS)
        window = text_utils.take_sub_str_based_on_range(window, 0, spouse_keyword.start())
    except regexUtils.RegexNoneMatchException:
        pass

    try:
        profession_match = regexUtils.safe_search(
            self.PROFESSION_PATTERN, window, self.PROFESSION_OPTIONS)
        cursor_location = profession_match.end()
        raw_profession = profession_match.group("profession")
    except regexUtils.RegexNoneMatchException:
        pass

    cleaned = self._clean_professions(raw_profession)
    if cleaned is None:
        self.metadata_collector.add_error_record('professionNotFound', 4)
    return cleaned, cursor_location
def _find_omakotitalo(self, text):
    """Tell whether ``self.OMAKOTITALO_PATTERN`` can be found in ``text``.

    :return: ``True`` when the search succeeds, ``False`` when it raises
        ``RegexNoneMatchException``.
    """
    try:
        regexUtils.safe_search(self.OMAKOTITALO_PATTERN, text, self.OMAKOTITALO_OPTIONS)
    except regexUtils.RegexNoneMatchException:
        return False
    return True
def _find_patterns(self, text):
    """Check every pattern in ``self.patterns_to_find`` against ``text``.

    :return: dict mapping each key of ``self.patterns_to_find`` to ``True``
        when its pattern was found and ``False`` when the search raised
        ``RegexNoneMatchException``.
    """
    hits = {}
    for key, pattern in self.patterns_to_find.items():
        try:
            regexUtils.safe_search(pattern, text, self.OPTIONS)
        except regexUtils.RegexNoneMatchException:
            was_found = False
        else:
            was_found = True
        hits[key] = was_found
    return hits
def _process_child(self, child, child_list):
    """Build a record for one child, or stop extraction at a birth-place marker.

    :param child: text describing a single child.
    :param child_list: child records built so far; entries whose location
        name is ``""`` are filled in place when ``child`` turns out to be a
        "Syntyneet <place>" marker.
    :return: dict keyed by ``KEYS["childName"]``, ``KEYS["gender"]``,
        ``KEYS["birthYear"]``, ``KEYS["childLocationName"]`` and
        ``KEYS["kairaId"]``.
    :raises StopChildExtractionException: when ``child`` is a birth-place
        marker rather than a child.
    :raises regex_utils.RegexNoneMatchException: propagated from the name
        search, which is not guarded here.
    """
    # Raw string: "\s" and "\w" are invalid escapes in a plain literal.
    # NOTE(review): "{s<=1}" looks like fuzzy-match syntax (one substitution
    # allowed) — depends on the engine behind regex_utils.search; confirm.
    birth_loc = regex_utils.search(
        r"syntyneet{s<=1}\s(?P<location>\w*)", child, self.CHILD_OPTIONS)
    if birth_loc is not None:
        # found a "Syntyneet <place>" string. Set it to the previous children.
        shared_location = birth_loc.group("location")
        for c in child_list:
            if c[KEYS["childLocationName"]] == "":
                c[KEYS["childLocationName"]] = shared_location
        raise StopChildExtractionException(
            'Child extraction should be stopped here. Current child is not valid child.'
        )

    name = regex_utils.safe_search(self.NAME_PATTERN, child, self.CHILD_OPTIONS).group("name")
    name = name.strip()
    name = name.strip("-")
    name = name.strip(" ")

    try:
        gender = Sex.find_sex(name)
    except SexException:
        self.metadata_collector.add_error_record('genderNotFound', 2)
        gender = None

    try:
        year_match = regex_utils.safe_search(self.YEAR_PATTERN, child, self.CHILD_OPTIONS)
        year = year_match.group("year")
        # Values below 70 get the "19" prefix, everything else "18".
        if float(year) < 70:
            year = "19" + year
        else:
            year = "18" + year
    except regex_utils.RegexNoneMatchException:
        year = ""

    try:
        loc_match = regex_utils.safe_search(self.LOCATION_PATTERN, child, self.CHILD_OPTIONS)
        location = loc_match.group("location")
        location = location.strip()
        location = location.strip("-")
    except regex_utils.RegexNoneMatchException:
        location = ""

    return {
        KEYS["childName"]: name,
        KEYS["gender"]: gender,
        KEYS["birthYear"]: text_utils.int_or_none(year),
        KEYS["childLocationName"]: location,
        KEYS["kairaId"]: self._kaira_id_provider.get_new_id('C')
    }
def _find_patterns(self, text):
    """Extract a value for every pattern in ``self.patterns_to_find``.

    Each pattern is tried with ``self.QUANTITY_PATTERN`` in front of it first
    and, if that attempt raises ``RegexNoneMatchException``, with the quantity
    pattern appended instead.

    :return: dict mapping each key to ``self._process_value(match)`` of the
        first attempt that succeeded, or to ``None`` when both failed.
    """
    values = {}
    for key, pattern in self.patterns_to_find.items():
        candidates = (self.QUANTITY_PATTERN + pattern, pattern + self.QUANTITY_PATTERN)
        for candidate in candidates:
            try:
                match = regexUtils.safe_search(candidate, text, self.OPTIONS)
                values[key] = self._process_value(match)
                break
            except regexUtils.RegexNoneMatchException:
                continue
        else:
            # Neither ordering produced a value.
            values[key] = None
    return values
def _find_owner_name_details(self, text, start_position):
    """Find the owner's name in ``text`` and split it with ``self._split_names``.

    :return: tuple ``(owner_name_data, cursor_location)``. On a failed search
        an ``ownerNameNotFound`` error is recorded and the defaults
        ``('', '', '')`` and ``start_position`` are returned.
    """
    name_parts = ('', '', '')
    cursor_location = start_position
    try:
        name_match = regexUtils.safe_search(
            self.OWNER_NAME_PATTERN, text, self.OWNER_OPTIONS)
        # The match end is added to the caller-supplied start position.
        cursor_location = start_position + name_match.end()
        name_parts = self._split_names(name_match.group("name"))
    except regexUtils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('ownerNameNotFound', 7)
    return name_parts, cursor_location
def _find_owner_year(self, text, start_position):
    """Find the owner's year in ``text``.

    :return: tuple ``(owner_year, cursor_location)``; the year is passed
        through ``text_utils.int_or_none``. On a failed search an
        ``ownerYearNotFound`` error is recorded and ``(None, start_position)``
        is returned.
    """
    parsed_year = None
    cursor_location = start_position
    try:
        year_match = regexUtils.safe_search(
            self.OWNER_YEAR_PATTERN, text, self.OWNER_OPTIONS)
        cursor_location = start_position + year_match.end()
        parsed_year = text_utils.int_or_none(year_match.group("year"))
    except regexUtils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('ownerYearNotFound', 2)
    return parsed_year, cursor_location
def _find_date(self, text, start_position):
    """Find a two-digit wedding year in ``text`` and prefix it with "19".

    :return: tuple ``(wedding_year, cursor_location)``. ``wedding_year`` is
        ``None`` and the cursor stays at ``start_position`` when the search
        raises ``RegexNoneMatchException``.
    """
    wedding_year = None
    cursor_location = start_position
    try:
        wedding_match = regex_utils.safe_search(self.PATTERN, text, self.OPTIONS)
        # Dirty fix for inaccuracy in positions which would screw the Location extraction
        cursor_location = wedding_match.end() + start_position - 4
        wedding_year = text_utils.int_or_none("19" + wedding_match.group("year"))
    except regex_utils.RegexNoneMatchException:
        pass
    return wedding_year, cursor_location
def _find_family(self, text, start_position):
    """Search for the ``family`` group in a window of ``text``.

    The window is cut with ``text_utils.take_sub_str_based_on_pos`` using
    ``start_position`` and ``self.SEARCH_SPACE``.

    :return: tuple ``(own_family, cursor_location)``; ``(None, 0)`` when the
        search raises ``RegexNoneMatchException``. The cursor is the match end
        inside the window.
    """
    window = text_utils.take_sub_str_based_on_pos(text, start_position, self.SEARCH_SPACE)
    family_name = None
    cursor_location = 0
    try:
        family_match = regexUtils.safe_search(
            self.FAMILY_PATTERN, window, self.FAMILY_OPTIONS)
        cursor_location = family_match.end()
        family_name = family_match.group("family")
    except regexUtils.RegexNoneMatchException:
        pass
    return family_name, cursor_location
def find_location(self, text):
    """Search ``text`` with ``self.PATTERN``.

    Note: the match object itself is handed back, not the matched string.

    :param text: text to search.
    :return: tuple ``(match, cursor_location)`` where the cursor is the end
        of the match.
    :raises LocationException: when the search raises
        ``RegexNoneMatchException``; ``text`` is passed to the exception.
    """
    try:
        location_match = regex_utils.safe_search(self.PATTERN, text, self.OPTIONS)
        return location_match, location_match.end()
    except regex_utils.RegexNoneMatchException:
        raise LocationException(text)
def _get_area(self, text, pattern):
    """Return the first non-``None`` of the groups ``area1`` / ``area2``.

    :param text: text to search.
    :param pattern: pattern expected to define both named groups.
    :return: the captured area, or ``None`` when neither group captured
        anything or the search raised ``RegexNoneMatchException``.
    """
    area = None
    try:
        area_match = regexUtils.safe_search(pattern, text, self.AREA_OPTIONS)
        # "area1" wins over "area2"; stop at the first group that captured.
        for group_name in ("area1", "area2"):
            captured = area_match.group(group_name)
            if captured is not None:
                area = captured
                break
    except regexUtils.RegexNoneMatchException:
        pass
    return area
def _find_location_match(self, text):
    """Search ``text`` for a capitalised place name (group ``location``).

    Note: the match object itself is handed back, not the matched string.

    :param text: text to search.
    :return: tuple ``(match, cursor_location)`` where the cursor is the end
        of the match.
    :raises LocationException: when the search raises
        ``RegexNoneMatchException``; ``text`` is passed to the exception.
    """
    # Digits or " s", one separator (whitespace, comma or dot), then the name
    # with an optional " mlk" suffix.
    location_pattern = r'(?:\d+| s)(?:\s|,|\.)(?P<location>[A-ZÄ-Ö]{1,1}[A-ZÄ-Öa-zä-ö-]{1,}(?: mlk)?)'
    try:
        location_match = regex_utils.safe_search(location_pattern, text, re.UNICODE)
        return location_match, location_match.end()
    except regex_utils.RegexNoneMatchException:
        raise LocationException(text)
def _find_date(self, text):
    """Extract a date from ``text``.

    :return: tuple ``(date, cursor_location)`` where ``date`` is a dict with
        the keys ``day``, ``month`` and ``year``. When
        ``self._if_written_month_names_extract_them`` returns a value, its
        items 0 and 1 become month and year and ``day`` is ``''``.
    :raises DateException: when ``RegexNoneMatchException`` is raised anywhere
        during the extraction; ``text`` is passed to the exception.
    """
    try:
        date_match = regex_utils.safe_search(self.PATTERN, text, self.OPTIONS)
        written_form = self._if_written_month_names_extract_them(date_match)
        cursor_location = date_match.end()

        if written_form is not None:
            worded_date = {'day': '', 'month': written_form[0], 'year': written_form[1]}
            return worded_date, cursor_location

        year = self._get_year_from_match(date_match)
        day_and_month = self._get_month_and_day_from_match(date_match)
        numeric_date = {
            'day': day_and_month['day'],
            'month': day_and_month['month'],
            'year': year,
        }
        return numeric_date, cursor_location
    except regex_utils.RegexNoneMatchException:
        raise DateException(text)
def _find_children(self, text):
    """Find the children section of ``text`` and split it into entries.

    The ``children`` group is passed through ``self._clean_children`` and then
    ``self._split_children``.

    :return: tuple ``(children, cursor_location)``; ``([], 0)`` plus a
        recorded ``childrenNotFound`` error when the search raises
        ``RegexNoneMatchException``.
    """
    child_entries = []
    cursor_location = 0
    try:
        section_match = regex_utils.safe_search(
            self.CHILD_PATTERN, text, self.CHILD_OPTIONS)
        cursor_location = section_match.end()
        cleaned_section = self._clean_children(section_match.group("children"))
        child_entries = self._split_children(cleaned_section)
    except regex_utils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('childrenNotFound', 5)
    return child_entries, cursor_location
def _get_area(self, text, pattern):
    """Return the first non-``None`` of ``area1`` / ``area2`` plus the match end.

    :param text: text to search.
    :param pattern: pattern expected to define both named groups.
    :return: tuple ``(area, cursor_location)``; ``(None, 0)`` when the search
        raises ``RegexNoneMatchException``. ``area`` is also ``None`` when
        neither group captured anything.
    """
    area = None
    cursor_location = 0
    try:
        area_match = regexUtils.safe_search(pattern, text, self.AREA_OPTIONS)
        cursor_location = area_match.end()
        # "area1" wins over "area2"; stop at the first group that captured.
        for group_name in ("area1", "area2"):
            captured = area_match.group(group_name)
            if captured is not None:
                area = captured
                break
    except regexUtils.RegexNoneMatchException:
        pass
    return area, cursor_location
def _find_spouse(self, text, start_position):
    """Find the spouse section of ``text`` and parse it.

    The ``spousedata`` group is handed to ``self._find_spouse_data``.

    :return: tuple ``(spouse_data, cursor_location)``;
        ``(None, start_position)`` when the search raises
        ``RegexNoneMatchException``.
    """
    spouse_record = None
    cursor_location = start_position
    try:
        spouse_match = regex_utils.safe_search(self.PATTERN, text, self.OPTIONS)
        spouse_record = self._find_spouse_data(spouse_match.group("spousedata"))
        # Dirty fix for inaccuracy in positions which would screw the Location extraction
        cursor_location = spouse_match.end() + start_position - 4
    except regex_utils.RegexNoneMatchException:
        pass
    return spouse_record, cursor_location
def _find_children(self, text, start_position):
    """Find the children section of ``text`` and split it into entries.

    Every ``sekä`` in ``text`` is replaced by a comma before searching. The
    ``children`` group is passed through ``self._clean_children`` and then
    ``self._split_children``.

    :return: tuple ``(children_entries, cursor_location)``. When the search
        raises ``RegexNoneMatchException`` a ``childrenNotFound`` error is
        recorded and ``([], start_position)`` is returned.
    """
    entries = []
    cursor_location = start_position
    normalized_text = re.sub(r"sekä", ",", text)
    try:
        section_match = regex_utils.safe_search(
            self.CHILD_PATTERN, normalized_text, self.CHILD_OPTIONS)
        # NOTE(review): the match end is used as-is, without adding
        # start_position, while the failure path returns start_position —
        # confirm this is intended.
        cursor_location = section_match.end()
        cleaned_section = self._clean_children(section_match.group("children"))
        entries = self._split_children(cleaned_section)
    except regex_utils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('childrenNotFound', 5)
    return entries, cursor_location
def _find_spouse_data(self, text):
    """Build the spouse record from the spouse section ``text``.

    :return: dict keyed by ``KEYS["spouseBirthData"]``,
        ``KEYS["formerSurname"]``, ``KEYS["spouseName"]`` and
        ``KEYS["kairaId"]``; ``None`` (after recording a ``spouseNotFound``
        error) when ``RegexNoneMatchException`` is raised.
    """
    try:
        name_match = regex_utils.safe_search(self.NAMEPATTERN, text, self.OPTIONS)
        # Drop a trailing " o" left at the end of the captured name.
        spouse_name = re.sub(r"\so$", "", name_match.group("name").strip())

        # Details are parsed from two characters before the end of the name match.
        remainder = text[name_match.end() - 2:]
        details, _ = self._find_spouse_details(remainder)

        # Map data to spouse object
        return {
            KEYS["spouseBirthData"]: {**details['birthday']},
            KEYS["formerSurname"]: details[KEYS['formerSurname']],
            KEYS["spouseName"]: spouse_name,
            KEYS["kairaId"]: self.kaira_id_provider.get_new_id('S'),
        }
    except regex_utils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('spouseNotFound', 7)
    return None
def _find_spouse_data(self, sub_text, entry):
    """Build the spouse record from the spouse section ``sub_text``.

    :param sub_text: text of the spouse section.
    :param entry: mapping whose ``'full_text'`` item is forwarded to
        ``self._find_spouse_details``.
    :return: on success a dict made of the parsed ``'spouse'`` details plus
        ``KEYS['spouseName']``, ``KEYS['hasSpouse']`` and ``KEYS['kairaId']``.
        When ``RegexNoneMatchException`` is raised a ``spouseNotFound`` error
        is recorded and the tuple ``(spouse_name, spouse_details)`` with
        whatever was gathered so far is returned instead.
    """
    # NOTE(review): success returns a dict, failure returns a 2-tuple —
    # callers presumably handle both shapes; confirm before unifying.
    spouse_name = ''
    spouse_details = None
    try:
        name_match = regex_utils.safe_search(self.NAMEPATTERN, sub_text, self.OPTIONS)
        # Drop a trailing " o" left at the end of the captured name.
        spouse_name = re.sub(r'\so$', '', name_match.group('name').strip())

        # Details are parsed from two characters before the end of the name match.
        remainder = sub_text[name_match.end() - 2:]
        spouse_details, _ = self._find_spouse_details(remainder, entry['full_text'])
        spouse_details = spouse_details['spouse']

        return {
            **spouse_details,
            KEYS['spouseName']: spouse_name,
            KEYS['hasSpouse']: True,
            KEYS['kairaId']: self.kaira_id_provider.get_new_id('S'),
        }
    except regex_utils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('spouseNotFound', 6)
    return spouse_name, spouse_details
def _find_locations(self, text):
    """Extract the location records listed in ``text``.

    The ``asuinpaikat`` group is cleaned with ``clean_locations``, parsed with
    ``migration_parser.parse_locations`` and every parsed location is turned
    into one or more cleaned location entries.

    :return: tuple ``(location_entries, cursor_location)``. When the search
        raises ``RegexNoneMatchException`` an ``otherLocationNotFound`` error
        is recorded and ``([], 0)`` is returned.
    """
    # Replace all weird invisible white space characters with regular space
    text = re.sub(r'\s', r' ', text)
    cursor_location = 0
    location_entries = []

    def _get_location_entries(parsed_location):
        """Turn one parsed location into a list of cleaned location records."""
        village_information = None

        # Parsed result set may contain municipality and village information. If only one result is in the
        # result set, interpret it as municipality
        has_municipality = 'municipality' in parsed_location
        raw_name = parsed_location['municipality' if has_municipality else 'place']

        # Try to normalize place names first so that the coordinate fetch from DB might work better
        entry_name, entry_region = place_name_cleaner.try_to_normalize_place_name_with_known_aliases(
            raw_name, return_region=True)
        if has_municipality:
            village_information = self._get_village(parsed_location)

        geocoordinates = get_coordinates_by_name(entry_name)
        entry_name = validate_location_name(entry_name, geocoordinates)

        # If region was in db associated to coordinates, override previously set region with it
        if 'region' in geocoordinates:
            entry_region = geocoordinates['region']

        def _build_record(*years):
            # FIXME: Refactor this to the _postprocess method?
            return place_name_cleaner.clean_place_name(
                self._get_location_entry(entry_name, entry_region, geocoordinates,
                                         village_information, *years))

        if 'year_information' not in parsed_location:
            return [_build_record()]

        # One record per migration, carrying whichever years are present.
        location_records = []
        for migration in parsed_location['year_information']:
            moved_in = (text_utils.int_or_none(migration['moved_in'])
                        if 'moved_in' in migration else None)
            moved_out = (text_utils.int_or_none(migration['moved_out'])
                         if 'moved_out' in migration else None)
            location_records.append(_build_record(moved_in, moved_out))
        return location_records

    try:
        found_locations = regex_utils.safe_search(
            self.LOCATION_PATTERN, text, self.LOCATION_OPTIONS)
        cursor_location = found_locations.end()
        cleaned_locations = clean_locations(found_locations.group('asuinpaikat'))

        # Parse location string with BNF parser
        parsed_locations = migration_parser.parse_locations(cleaned_locations)

        try:
            for parsed in parsed_locations:
                location_entries.extend(_get_location_entries(parsed))
        except InvalidLocationException:
            # Processing stops at the first invalid location; entries
            # collected before it are kept.
            pass
    except regex_utils.RegexNoneMatchException:
        self.metadata_collector.add_error_record('otherLocationNotFound', 5)

    return location_entries, cursor_location
def _find_hostess_name(self, text):
    """Find the hostess's name in ``text`` and split it with ``self._split_names``.

    The search is not guarded here, so any exception raised by
    ``regexUtils.safe_search`` propagates to the caller.

    :return: tuple ``(hostess_name, cursor_location)`` where the cursor is the
        end of the name match.
    """
    name_match = regexUtils.safe_search(
        self.HOSTESS_NAME_PATTERN, text, self.HOSTESS_OPTIONS)
    cursor_location = name_match.end()
    name_parts = self._split_names(name_match.group("name"))
    return name_parts, cursor_location