def __init__(self, cursor_location_depends_on=None, options=None):
    """Configure the spouse extractor.

    Forwards both arguments unchanged to the parent initializer, then
    stores the regex patterns, regex flags and a Kaira id provider on the
    instance.

    :param cursor_location_depends_on: passed through to the parent class.
    :param options: passed through to the parent class.
    """
    super(SpouseExtractor, self).__init__(cursor_location_depends_on, options)

    # Source of ids for spouse records.
    self.kaira_id_provider = KairaIdProvider()

    # Flags shared by every regex search this extractor performs.
    self.OPTIONS = re.UNICODE | re.IGNORECASE

    # Captures everything between "vmo" and the first of the keywords listed
    # in the look-ahead.
    self.PATTERN = r"vmo\.?(?P<spousedata>[A-ZÄ-Öa-zä-ö\s\.,\d-]*)(?=(Lapset|poika|tytär|asuinp|suvulla|tila))"
    # Leading run of word characters, whitespace and hyphens.
    self.NAMEPATTERN = r"(?P<name>^[\w\s-]*)"

    # NOTE(review): presumably read by the base class - confirm.
    self.REQUIRES_MATCH_POSITION = False
    self.SUBSTRING_WIDTH = 100
def __init__(self, cursor_location_depends_on=None, options=None):
    """Configure the child extractor.

    Forwards both arguments unchanged to the parent initializer, then
    stores a Kaira id provider plus the regex patterns and flags used to
    locate and split the children list.

    :param cursor_location_depends_on: passed through to the parent class.
    :param options: passed through to the parent class.
    """
    super(ChildExtractor, self).__init__(cursor_location_depends_on, options)

    # Source of ids for child records.
    self._kaira_id_provider = KairaIdProvider()

    # Flags for the section search and for the per-child searches.
    self.CHILD_OPTIONS = re.UNICODE | re.IGNORECASE
    self.SPLIT_OPTIONS1 = re.UNICODE | re.IGNORECASE

    # Children section: from "Lapset"/"tytär"/"poika" plus ';' or ':' up to a
    # fuzzy "Asuinp" (at most one substitution).
    self.CHILD_PATTERN = r"(?:Lapset|tytär|poika)(;|:)(?P<children>.*?)Asuinp{s<=1}"
    # One child: a run of at least three letters, digits, whitespace or '-'.
    self.SPLIT_PATTERN1 = r"(?P<child>[A-ZÄ-Öa-zä-ö\d\s-]{3,})"

    # Pieces picked out of a single child string.
    self.NAME_PATTERN = r"^(?P<name>[a-zä-ö\s-]+)"
    self.YEAR_PATTERN = r"(?P<year>(\d\d))"
    self.LOCATION_PATTERN = r"\d\d\s(?P<location>[a-zä-ö\s-]+$)"
class SpouseExtractor(BaseExtractor):
    """Extract spouse data that follows a "vmo" marker in an entry's text.

    The text captured after the marker is split into a name and a remainder.
    The remainder is handed to the sub extraction pipeline, whose result
    supplies the birth data and former surname.
    """

    extraction_key = KEYS["spouse"]

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Store the patterns, flags and id provider used by this extractor.

        :param cursor_location_depends_on: passed through to the parent class.
        :param options: passed through to the parent class.
        """
        super(SpouseExtractor, self).__init__(cursor_location_depends_on, options)
        # Captures everything between "vmo" and the first keyword named in
        # the look-ahead.
        self.PATTERN = r"vmo\.?(?P<spousedata>[A-ZÄ-Öa-zä-ö\s\.,\d-]*)(?=(Lapset|poika|tytär|asuinp|suvulla|tila))"
        self.NAMEPATTERN = r"(?P<name>^[\w\s-]*)"
        self.OPTIONS = (re.UNICODE | re.IGNORECASE)
        # NOTE(review): presumably read by the base class - confirm.
        self.REQUIRES_MATCH_POSITION = False
        self.SUBSTRING_WIDTH = 100
        self.kaira_id_provider = KairaIdProvider()

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the spouse in ``entry['text']`` and record the result.

        :return: whatever ``_add_to_extraction_results`` returns.
        """
        start_position = self.get_starting_position(extraction_metadata)
        spouse_data, cursor_location = self._find_spouse(entry['text'], start_position)
        return self._add_to_extraction_results(spouse_data,
                                               extraction_results,
                                               extraction_metadata,
                                               cursor_location=cursor_location)

    def _find_spouse(self, text, start_position):
        """Search ``text`` for the spouse section.

        :param text: text to search.
        :param start_position: offset added to the match end to produce the
            returned cursor location.
        :return: ``(spouse_data, cursor_location)``. When the section is not
            found, ``spouse_data`` is None and ``cursor_location`` equals
            ``start_position``.
        """
        cursor_location = start_position
        spouse_data = None

        try:
            found_spouse_match = regex_utils.safe_search(self.PATTERN, text, self.OPTIONS)
            spouse_data = self._find_spouse_data(found_spouse_match.group("spousedata"))

            # Dirty fix for inaccuracy in positions which would screw the Location extraction
            cursor_location = found_spouse_match.end() + start_position - 4
        except regex_utils.RegexNoneMatchException:
            # No spouse section in this text; keep the defaults.
            pass

        return spouse_data, cursor_location

    def _find_spouse_data(self, text):
        """Build the spouse record from the text captured after the marker.

        :param text: the ``spousedata`` group of the spouse match.
        :return: dict with birth data, former surname, name and a new Kaira
            id, or None when a ``RegexNoneMatchException`` is raised (an
            error record is added in that case).
        """
        try:
            name = regex_utils.safe_search(self.NAMEPATTERN, text, self.OPTIONS)
            spouse_name = name.group("name").strip()
            # Drop a trailing " o" left on the name.
            spouse_name = re.sub(r"\so$", "", spouse_name)

            # The details start two characters before the end of the name
            # match. Clamp at 0 so a very short (or empty) name match does not
            # turn into a negative index that slices from the END of the text.
            details_start = max(name.end() - 2, 0)
            spouse_details, _ = self._find_spouse_details(text[details_start:])

            # Map data to spouse object
            return {
                KEYS["spouseBirthData"]: {
                    **spouse_details['birthday']
                },
                KEYS["formerSurname"]: spouse_details[KEYS['formerSurname']],
                KEYS["spouseName"]: spouse_name,
                KEYS["kairaId"]: self.kaira_id_provider.get_new_id('S')
            }
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('spouseNotFound', 7)
            return None

    def _find_spouse_details(self, text):
        """Run the sub extraction pipeline on ``text`` and return its result."""
        return self._sub_extraction_pipeline.process({'text': text})
class ChildExtractor(BaseExtractor):
    """Extract the list of children from an entry's text.

    Each child record holds a name, gender, birth year, location name and a
    Kaira id. The postprocess step then cleans the location name and adds
    geocoder data under the child's ``'location'`` key.
    """

    # Shared by all instances of the class.
    geocoder = GeoCoder()
    extraction_key = 'children'

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Store the id provider, patterns and flags used by this extractor.

        :param cursor_location_depends_on: passed through to the parent class.
        :param options: passed through to the parent class.
        """
        super(ChildExtractor, self).__init__(cursor_location_depends_on, options)
        self._kaira_id_provider = KairaIdProvider()

        # Children section: keyword, ';' or ':', then everything up to a fuzzy
        # "Asuinp" (at most one substitution).
        self.CHILD_PATTERN = r"(?:Lapset|tytär|poika)(;|:)(?P<children>.*?)Asuinp{s<=1}"
        self.CHILD_OPTIONS = (re.UNICODE | re.IGNORECASE)
        self.SPLIT_PATTERN1 = r"(?P<child>[A-ZÄ-Öa-zä-ö\d\s-]{3,})"
        self.NAME_PATTERN = r"^(?P<name>[a-zä-ö\s-]+)"
        self.YEAR_PATTERN = r"(?P<year>(\d\d))"
        self.LOCATION_PATTERN = r"\d\d\s(?P<location>[a-zä-ö\s-]+$)"
        self.SPLIT_OPTIONS1 = (re.UNICODE | re.IGNORECASE)

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the children in ``entry['text']`` and record the result.

        :return: whatever ``_add_to_extraction_results`` returns.
        """
        children, cursor_location = self._find_children(entry['text'])
        return self._add_to_extraction_results(children,
                                               extraction_results,
                                               extraction_metadata,
                                               cursor_location)

    def _postprocess(self, entry, extraction_results, extraction_metadata):
        """Add location information to each extracted child.

        :param entry: not used by this method.
        :param extraction_results: results holding the children under this
            extractor's key.
        :param extraction_metadata: returned unchanged.
        :return: ``(extraction_results, extraction_metadata)``.
        """
        current_children = self._get_output_path(extraction_results)[self.extraction_key]
        augmented_children = self._augment_location_data_of_children(current_children)
        self._get_output_path(extraction_results)[self.extraction_key] = augmented_children
        return extraction_results, extraction_metadata

    def _augment_location_data_of_children(self, children):
        """Clean each child's location name and attach geocoder data.

        Mutates the child dicts in place: the location name entry is replaced
        by the normalized location entry, and a ``'location'`` key is added
        that merges the cleaned entry with the geocoder result.

        :param children: iterable of child dicts.
        :return: the same ``children`` object.
        """
        for child in children:
            location_entry = {
                KEYS['locationName']: child[KEYS["childLocationName"]],
                KEYS['region']: None,
                KEYS['latitude']: None,
                KEYS['longitude']: None
            }

            location_entry = place_name_cleaner.clean_place_name(location_entry)
            child[KEYS["childLocationName"]] = place_name_cleaner.normalize_place(location_entry)

            coordinates = self._find_birth_coord_and_region(
                child[KEYS["childLocationName"]][KEYS['locationName']])
            child['location'] = {**location_entry, **coordinates}

        return children

    def _find_children(self, text):
        """Locate the children section in ``text`` and split it into records.

        :return: ``(children, cursor_location)``. When the section is not
            found an error record is added and ``([], 0)`` is returned.
        """
        children = []
        cursor_location = 0

        try:
            found_children = regex_utils.safe_search(self.CHILD_PATTERN, text, self.CHILD_OPTIONS)
            cursor_location = found_children.end()
            children_str = found_children.group("children")
            children_str = self._clean_children(children_str)
            children = self._split_children(children_str)
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('childrenNotFound', 5)

        return children, cursor_location

    @staticmethod
    def _clean_children(children_str):
        """Strip surrounding commas, then periods, then whitespace."""
        children_str = children_str.strip(",")
        children_str = children_str.strip(".")
        children_str = children_str.strip()
        return children_str

    def _split_children(self, children_str):
        """Turn the cleaned children string into a list of child records.

        Candidates that raise ``RegexNoneMatchException`` or
        ``StopChildExtractionException`` are skipped.
        """
        found_children = regex_utils.regex_iter(self.SPLIT_PATTERN1, children_str,
                                                self.SPLIT_OPTIONS1)
        children = []

        for m in found_children:
            try:
                children.append(self._process_child(m.group("child"), children))
            except (regex_utils.RegexNoneMatchException, StopChildExtractionException):
                # Not a usable child entry; move on to the next candidate.
                pass

        return children

    def _process_child(self, child, child_list):
        """Parse one child candidate into a record.

        :param child: text of a single candidate.
        :param child_list: records collected so far. Records with an empty
            location name are updated in place when ``child`` turns out to be
            a "syntyneet <place>" note.
        :return: dict with name, gender, birth year, location name and a new
            Kaira id.
        :raises StopChildExtractionException: when ``child`` is a
            "syntyneet <place>" note rather than a child.
        :raises regex_utils.RegexNoneMatchException: propagated from the name
            search when no name is found.
        """
        # Raw string: the pattern contains \s and \w, which are invalid
        # escape sequences in an ordinary string literal.
        birth_loc = regex_utils.search(r"syntyneet{s<=1}\s(?P<location>\w*)", child,
                                       self.CHILD_OPTIONS)

        if birth_loc is not None:
            # found a "Syntyneet <place>" string. Set it to the previous children.
            for c in child_list:
                if c[KEYS["childLocationName"]] == "":
                    c[KEYS["childLocationName"]] = birth_loc.group("location")

            raise StopChildExtractionException(
                'Child extraction should be stopped here. Current child is not valid child.'
            )

        name = regex_utils.safe_search(self.NAME_PATTERN, child,
                                       self.CHILD_OPTIONS).group("name")
        name = name.strip()
        name = name.strip("-")
        name = name.strip(" ")

        try:
            gender = Sex.find_sex(name)
        except SexException:
            self.metadata_collector.add_error_record('genderNotFound', 2)
            gender = None

        try:
            year_match = regex_utils.safe_search(self.YEAR_PATTERN, child, self.CHILD_OPTIONS)
            year = year_match.group("year")
            # Two-digit years below 70 are read as 19xx, all others as 18xx.
            if int(year) < 70:
                year = "19" + year
            else:
                year = "18" + year
        except regex_utils.RegexNoneMatchException:
            year = ""

        try:
            loc_match = regex_utils.safe_search(self.LOCATION_PATTERN, child, self.CHILD_OPTIONS)
            location = loc_match.group("location")
            location = location.strip()
            location = location.strip("-")
        except regex_utils.RegexNoneMatchException:
            location = ""

        return {
            KEYS["childName"]: name,
            KEYS["gender"]: gender,
            KEYS["birthYear"]: text_utils.int_or_none(year),
            KEYS["childLocationName"]: location,
            KEYS["kairaId"]: self._kaira_id_provider.get_new_id('C')
        }

    def _find_birth_coord_and_region(self, location_name):
        """Look up ``location_name`` with the geocoder.

        :return: the geocoder's coordinates, or its empty coordinates when
            the location is not found.
        """
        # The previous version repeated the identical lookup once more after
        # a miss; a second call with the same argument adds nothing.
        try:
            return self.geocoder.get_coordinates(location_name)
        except LocationNotFound:
            return self.geocoder.get_empty_coordinates()
class SpouseExtractor(BaseExtractor):
    """Extract spouse data that follows a "Puol" marker in an entry's text.

    The text captured after the marker is split into a name and a remainder.
    The remainder, together with the entry's full text, is handed to the sub
    extraction pipeline, whose ``'spouse'`` result is merged into the record.
    """

    extraction_key = 'spouse'

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Store the id provider, patterns and flags used by this extractor.

        :param cursor_location_depends_on: passed through to the parent class.
        :param options: passed through to the parent class.
        """
        super(SpouseExtractor, self).__init__(cursor_location_depends_on, options)
        self.kaira_id_provider = KairaIdProvider()
        # Captures everything between "Puol" and the first keyword named in
        # the look-ahead.
        self.PATTERN = r'Puol\.?,?(?P<spousedata>[A-ZÄ-Öa-zä-ö\s\.,\d-]*)(?=(Lapset|poika|tytär|asuinp))'
        self.NAMEPATTERN = r'(?P<name>^[\w\s-]*)'
        self.OPTIONS = (re.UNICODE | re.IGNORECASE)
        # NOTE(review): presumably read by the base class - confirm.
        self.REQUIRES_MATCH_POSITION = False
        self.SUBSTRING_WIDTH = 100

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the spouse in ``entry`` and record the result.

        :return: whatever ``_add_to_extraction_results`` returns.
        """
        start_position = self.get_starting_position(extraction_metadata)
        result, cursor_location = self._find_spouse(entry, start_position)
        return self._add_to_extraction_results(result,
                                               extraction_results,
                                               extraction_metadata,
                                               cursor_location=cursor_location)

    def _find_spouse(self, entry, start_position):
        """Search ``entry['text']`` for the spouse section.

        :param entry: dict providing ``'text'`` and ``'full_text'``.
        :param start_position: offset added to the match end to produce the
            returned cursor location.
        :return: ``(spouse_data, cursor_location)``. When the section is not
            found, ``spouse_data`` is None and ``cursor_location`` equals
            ``start_position``.
        """
        cursor_location = start_position
        spouse_data = None

        try:
            found_spouse_match = regex_utils.safe_search(self.PATTERN, entry['text'],
                                                         self.OPTIONS)
            spouse_data = self._find_spouse_data(found_spouse_match.group('spousedata'), entry)

            # Dirty fix for inaccuracy in positions which would screw the Location extraction
            cursor_location = found_spouse_match.end() + start_position - 4
        except regex_utils.RegexNoneMatchException:
            # No spouse section in this text; keep the defaults.
            pass

        return spouse_data, cursor_location

    def _find_spouse_data(self, sub_text, entry):
        """Build the spouse record from the text captured after the marker.

        :param sub_text: the ``spousedata`` group of the spouse match.
        :param entry: dict providing ``'full_text'`` for the sub pipeline.
        :return: dict merging the sub pipeline's ``'spouse'`` result with the
            name, a ``hasSpouse`` flag and a new Kaira id. Returns None when
            a ``RegexNoneMatchException`` is raised (an error record is
            added), so callers see the same "no spouse" value that
            ``_find_spouse`` uses instead of a tuple.
        """
        try:
            spouse_name_match = regex_utils.safe_search(self.NAMEPATTERN, sub_text,
                                                        self.OPTIONS)
            spouse_name = spouse_name_match.group('name').strip()
            # Drop a trailing " o" left on the name.
            spouse_name = re.sub(r'\so$', '', spouse_name)

            # The details start two characters before the end of the name
            # match. Clamp at 0 so a very short (or empty) name match does not
            # turn into a negative index that slices from the END of the text.
            details_start = max(spouse_name_match.end() - 2, 0)
            spouse_details, _ = self._find_spouse_details(sub_text[details_start:],
                                                          entry['full_text'])

            return {
                **spouse_details['spouse'],
                KEYS['spouseName']: spouse_name,
                KEYS['hasSpouse']: True,
                KEYS['kairaId']: self.kaira_id_provider.get_new_id('S'),
            }
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('spouseNotFound', 6)
            return None

    def _find_spouse_details(self, text, full_text):
        """Run the sub extraction pipeline on the given texts and return its result."""
        return self._sub_extraction_pipeline.process({
            'text': text,
            'full_text': full_text
        })
class CommonChildExtractor(BaseExtractor):
    """Shared logic for extracting a list of children from an entry's text.

    The methods read ``CHILD_PATTERN``, ``CHILD_OPTIONS``, ``SPLIT_PATTERN1``,
    ``SPLIT_OPTIONS1``, ``NAME_PATTERN`` and ``YEAR_PATTERN`` from ``self``.
    This class does not set them itself.
    NOTE(review): presumably subclasses provide them - confirm.
    """

    # Shared by all instances of the class.
    geocoder = GeoCoder()
    extraction_key = 'children'

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Initialize the parent class and create the Kaira id provider."""
        super(CommonChildExtractor, self).__init__(cursor_location_depends_on, options)
        self._kaira_id_provider = KairaIdProvider()

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the children in ``entry['text']`` and record the result."""
        offset = self.get_starting_position(extraction_metadata)
        found = self._find_children(entry['text'], offset)
        children, cursor = found[0], found[1]
        return self._add_to_extraction_results(children, extraction_results,
                                               extraction_metadata, cursor)

    def _find_children(self, text, start_position):
        """Locate the children section and split it into child records.

        :return: ``(children_entries, cursor_location)``. When the section is
            not found an error record is added and the result is
            ``([], start_position)``.
        """
        # Treat the word "sekä" as just another comma separator.
        normalized = text.replace("sekä", ",")

        try:
            section = regex_utils.safe_search(self.CHILD_PATTERN, normalized,
                                              self.CHILD_OPTIONS)
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('childrenNotFound', 5)
            return [], start_position

        end_of_section = section.end()
        try:
            tidy = self._clean_children(section.group("children"))
            return self._split_children(tidy), end_of_section
        except regex_utils.RegexNoneMatchException:
            # A failure while splitting is reported the same way; the cursor
            # has already moved to the end of the section at this point.
            self.metadata_collector.add_error_record('childrenNotFound', 5)
            return [], end_of_section

    @staticmethod
    def _clean_children(children_str):
        """Strip surrounding commas, then periods, then whitespace."""
        return children_str.strip(",").strip(".").strip()

    def _split_children(self, children_str):
        """Turn the cleaned children string into a list of child records."""
        collected = []
        candidates = regex_utils.regex_iter(self.SPLIT_PATTERN1, children_str,
                                            self.SPLIT_OPTIONS1)

        for candidate in candidates:
            # A " ja " inside one candidate separates two children,
            # e.g. "Seppo -41 ja Jaakko -32".
            separator = regex_utils.search(r"\sja\s", candidate.group("child"))

            if separator is None:
                single = self._process_child(candidate.group("child"))
                if single is not None:
                    collected.append(single)
                continue

            left = self._process_child(candidate.group("child")[0:separator.start()])
            right = self._process_child(candidate.group("child")[separator.end():])

            if left is not None and right is not None:
                self._twins_year_handler(left, right)

            collected.extend(c for c in (left, right) if c is not None)

        return collected

    @staticmethod
    def _twins_year_handler(first, second):
        """Copy the second child's birth year to the first when it lacks one.

        For twins the book doesn't explicitly define the birth year of the
        first one, so the second child's value is reused.
        """
        if first is None or second is None:
            return
        if first["birthYear"] is not None:
            return
        if second["birthYear"] is not None:
            first["birthYear"] = second["birthYear"]

    def _process_child(self, child):
        """Parse one child candidate into a record.

        :return: dict with ``name``, ``gender``, ``birthYear`` and
            ``kairaId``, or None when a ``RegexNoneMatchException`` escapes
            (for instance when no name is found).
        """
        try:
            raw_name = regex_utils.safe_search(self.NAME_PATTERN, child,
                                               self.CHILD_OPTIONS).group("name")
            child_name = raw_name.strip().strip("-").strip(" ")

            try:
                child_gender = Sex.find_sex(child_name)
            except SexException:
                self.metadata_collector.add_error_record('genderNotFound', 2)
                child_gender = None

            try:
                digits = regex_utils.safe_search(self.YEAR_PATTERN, child,
                                                 self.CHILD_OPTIONS).group("year")
                # Two-digit years below 70 are read as 19xx, all others as 18xx.
                century = "19" if float(digits) < 70 else "18"
                birth_year = text_utils.int_or_none(century + digits)
            except regex_utils.RegexNoneMatchException:
                birth_year = None

            return {
                "name": child_name,
                "gender": child_gender,
                "birthYear": birth_year,
                "kairaId": self._kaira_id_provider.get_new_id('C')
            }
        except regex_utils.RegexNoneMatchException:
            return None
def __init__(self, cursor_location_depends_on=None, options=None):
    """Initialize the parent class and create the Kaira id provider.

    :param cursor_location_depends_on: passed through to the parent class.
    :param options: passed through to the parent class.
    """
    super(CommonChildExtractor, self).__init__(
        cursor_location_depends_on,
        options,
    )
    id_provider = KairaIdProvider()
    self._kaira_id_provider = id_provider
def reset_kaira_id():
    """Create a ``KairaIdProvider`` and call its ``reset`` method."""
    KairaIdProvider().reset()