def __init__(self, cursor_location_depends_on=None, options=None):
        """Configure the regexes, flags and id provider used for spouse extraction.

        Both arguments are forwarded unchanged to the base-class initializer.
        """
        super(SpouseExtractor, self).__init__(
            cursor_location_depends_on, options)

        # Provider used to obtain a new id for each extracted spouse.
        self.kaira_id_provider = KairaIdProvider()

        # Matching configuration.
        self.REQUIRES_MATCH_POSITION = False
        self.SUBSTRING_WIDTH = 100
        self.OPTIONS = re.IGNORECASE | re.UNICODE

        # Captures everything after "vmo" up to (but excluding) the next
        # section keyword listed in the lookahead.
        self.PATTERN = r"vmo\.?(?P<spousedata>[A-ZÄ-Öa-zä-ö\s\.,\d-]*)(?=(Lapset|poika|tytär|asuinp|suvulla|tila))"
        # Leading run of word characters, whitespace and hyphens.
        self.NAMEPATTERN = r"(?P<name>^[\w\s-]*)"
    def __init__(self, cursor_location_depends_on=None, options=None):
        """Configure the id provider and the regexes used for child extraction.

        Both arguments are forwarded unchanged to the base-class initializer.
        """
        super(ChildExtractor, self).__init__(
            cursor_location_depends_on, options)
        self._kaira_id_provider = KairaIdProvider()

        # The same flag combination is used for locating the children section
        # and for splitting it into individual children.
        flags = re.IGNORECASE | re.UNICODE
        self.CHILD_OPTIONS = flags
        self.SPLIT_OPTIONS1 = flags

        # Children section: from "Lapset"/"tytär"/"poika" plus ';' or ':' up to
        # "Asuinp". NOTE(review): "{s<=1}" looks like the fuzzy-matching syntax
        # of the third-party `regex` module -- confirm regex_utils uses it.
        self.CHILD_PATTERN = r"(?:Lapset|tytär|poika)(;|:)(?P<children>.*?)Asuinp{s<=1}"

        # One child is a run of at least three letters/digits/whitespace/hyphens.
        self.SPLIT_PATTERN1 = r"(?P<child>[A-ZÄ-Öa-zä-ö\d\s-]{3,})"
        # Pieces of a single child fragment: leading name, two-digit year,
        # and the text following the year up to the end of the fragment.
        self.NAME_PATTERN = r"^(?P<name>[a-zä-ö\s-]+)"
        self.YEAR_PATTERN = r"(?P<year>(\d\d))"
        self.LOCATION_PATTERN = r"\d\d\s(?P<location>[a-zä-ö\s-]+$)"
class SpouseExtractor(BaseExtractor):
    """Extract spouse data from the part of an entry introduced by "vmo"."""

    extraction_key = KEYS["spouse"]

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Configure the regexes, flags and id provider used for extraction.

        Both arguments are forwarded unchanged to the base-class initializer.
        """
        super(SpouseExtractor, self).__init__(cursor_location_depends_on,
                                              options)
        # Captures everything after "vmo" up to (but excluding) the next
        # section keyword listed in the lookahead.
        self.PATTERN = r"vmo\.?(?P<spousedata>[A-ZÄ-Öa-zä-ö\s\.,\d-]*)(?=(Lapset|poika|tytär|asuinp|suvulla|tila))"
        # Leading run of word characters, whitespace and hyphens.
        self.NAMEPATTERN = r"(?P<name>^[\w\s-]*)"
        self.OPTIONS = (re.UNICODE | re.IGNORECASE)
        self.REQUIRES_MATCH_POSITION = False
        self.SUBSTRING_WIDTH = 100

        self.kaira_id_provider = KairaIdProvider()

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the spouse in ``entry['text']`` and store it in the results.

        :return: whatever ``_add_to_extraction_results`` returns.
        """
        start_position = self.get_starting_position(extraction_metadata)
        spouse_data, cursor_location = self._find_spouse(entry['text'],
                                                         start_position)
        return self._add_to_extraction_results(spouse_data,
                                               extraction_results,
                                               extraction_metadata,
                                               cursor_location=cursor_location)

    def _find_spouse(self, text, start_position):
        """Search ``text`` for the spouse section.

        :return: ``(spouse_data, cursor_location)``. When the section is not
            found, ``spouse_data`` is None and the cursor stays at
            ``start_position``.
        """
        cursor_location = start_position
        spouse_data = None

        try:
            found_spouse_match = regex_utils.safe_search(
                self.PATTERN, text, self.OPTIONS)
            spouse_data = self._find_spouse_data(
                found_spouse_match.group("spousedata"))

            # Dirty fix for inaccuracy in positions which would screw the Location extraction
            cursor_location = found_spouse_match.end() + start_position - 4
        except regex_utils.RegexNoneMatchException:
            # No spouse section in this entry; keep the defaults.
            pass

        return spouse_data, cursor_location

    def _find_spouse_data(self, text):
        """Build the spouse record from the captured spouse text.

        :param text: the ``spousedata`` group captured by ``PATTERN``.
        :return: dict with birth data, former surname, name and a new id, or
            None when a RegexNoneMatchException is raised along the way.
        """
        try:
            name = regex_utils.safe_search(self.NAMEPATTERN, text,
                                           self.OPTIONS)
            spouse_name = name.group("name").strip()
            # Drop a trailing " o" left at the end of the name.
            # NOTE(review): presumably the start of an "o.s." marker cut off
            # at the dot -- confirm against the source data.
            spouse_name = re.sub(r"\so$", "", spouse_name)

            # The details text starts two characters before the end of the
            # name match. Clamp at 0: a name match shorter than two characters
            # would otherwise give a negative index and slice from the END of
            # the text.
            details_start = max(name.end() - 2, 0)
            spouse_details, _ = self._find_spouse_details(
                text[details_start:])

            # Map data to spouse object
            return {
                KEYS["spouseBirthData"]: {
                    **spouse_details['birthday']
                },
                KEYS["formerSurname"]: spouse_details[KEYS['formerSurname']],
                KEYS["spouseName"]: spouse_name,
                KEYS["kairaId"]: self.kaira_id_provider.get_new_id('S')
            }

        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('spouseNotFound', 7)
            return None

    def _find_spouse_details(self, text):
        """Run the sub-extraction pipeline on ``text`` and return its result.

        The caller unpacks the result as a ``(details, metadata)`` pair.
        """
        return self._sub_extraction_pipeline.process({'text': text})
class ChildExtractor(BaseExtractor):
    """Extract the children listed in an entry and attach location data to each."""

    # Created once, when the class body is executed, and shared by instances.
    geocoder = GeoCoder()
    extraction_key = 'children'

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Configure the id provider and the regexes used for extraction.

        Both arguments are forwarded unchanged to the base-class initializer.
        """
        super(ChildExtractor, self).__init__(cursor_location_depends_on,
                                             options)
        self._kaira_id_provider = KairaIdProvider()
        # Children section: from "Lapset"/"tytär"/"poika" plus ';' or ':' up to
        # "Asuinp". NOTE(review): "{s<=1}" looks like the fuzzy-matching syntax
        # of the third-party `regex` module -- confirm regex_utils uses it.
        self.CHILD_PATTERN = r"(?:Lapset|tytär|poika)(;|:)(?P<children>.*?)Asuinp{s<=1}"
        self.CHILD_OPTIONS = (re.UNICODE | re.IGNORECASE)

        # One child is a run of at least three letters/digits/whitespace/hyphens.
        self.SPLIT_PATTERN1 = r"(?P<child>[A-ZÄ-Öa-zä-ö\d\s-]{3,})"
        self.NAME_PATTERN = r"^(?P<name>[a-zä-ö\s-]+)"
        self.YEAR_PATTERN = r"(?P<year>(\d\d))"
        self.LOCATION_PATTERN = r"\d\d\s(?P<location>[a-zä-ö\s-]+$)"
        self.SPLIT_OPTIONS1 = (re.UNICODE | re.IGNORECASE)

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the children in ``entry['text']`` and store them in the results.

        :return: whatever ``_add_to_extraction_results`` returns.
        """
        children, cursor_location = self._find_children(entry['text'])

        return self._add_to_extraction_results(children,
                                               extraction_results,
                                               extraction_metadata,
                                               cursor_location)

    def _postprocess(self, entry, extraction_results, extraction_metadata):
        """
        Add location information to each child in this postprocess method.
        :param entry: not used by this method.
        :param extraction_results: results holding the extracted children.
        :param extraction_metadata: returned unchanged.
        :return: tuple ``(extraction_results, extraction_metadata)``.
        """
        output = self._get_output_path(extraction_results)
        output[self.extraction_key] = self._augment_location_data_of_children(
            output[self.extraction_key])
        return extraction_results, extraction_metadata

    def _augment_location_data_of_children(self, children):
        """Clean each child's location name and add a 'location' entry.

        The child dicts are modified in place and the same list is returned.
        """
        for child in children:
            location_entry = {
                KEYS['locationName']: child[KEYS["childLocationName"]],
                KEYS['region']: None,
                KEYS['latitude']: None,
                KEYS['longitude']: None
            }

            location_entry = place_name_cleaner.clean_place_name(
                location_entry)
            # From here on the child's location field holds the normalized
            # place entry (indexed by key below), not the raw name string.
            child[KEYS[
                "childLocationName"]] = place_name_cleaner.normalize_place(
                    location_entry)

            coordinates = self._find_birth_coord_and_region(
                child[KEYS["childLocationName"]][KEYS['locationName']])
            # Coordinate values take precedence over the cleaned entry's keys.
            child['location'] = {**location_entry, **coordinates}

        return children

    def _find_children(self, text):
        """Locate the children section in ``text`` and parse it.

        :return: ``(children, cursor_location)``. When the section is not
            found an error record is added and ``([], 0)`` is returned.
        """
        children = []
        cursor_location = 0
        try:
            found_children = regex_utils.safe_search(self.CHILD_PATTERN, text,
                                                     self.CHILD_OPTIONS)
            cursor_location = found_children.end()
            children_str = self._clean_children(
                found_children.group("children"))
            children = self._split_children(children_str)

        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('childrenNotFound', 5)

        return children, cursor_location

    @staticmethod
    def _clean_children(children_str):
        """Strip surrounding commas, then periods, then whitespace (in that order)."""
        return children_str.strip(",").strip(".").strip()

    def _split_children(self, children_str):
        """Split the children string into fragments and parse each one.

        Fragments for which ``_process_child`` raises RegexNoneMatchException
        or StopChildExtractionException are skipped.
        """
        found_children = regex_utils.regex_iter(self.SPLIT_PATTERN1,
                                                children_str,
                                                self.SPLIT_OPTIONS1)
        children = []
        for m in found_children:
            try:
                children.append(self._process_child(m.group("child"),
                                                    children))
            except (regex_utils.RegexNoneMatchException,
                    StopChildExtractionException):
                pass

        return children

    def _process_child(self, child, child_list):
        """Parse one child fragment into a child record.

        :param child: text of a single fragment matched by SPLIT_PATTERN1.
        :param child_list: children parsed so far; entries with an empty
            location are updated in place when ``child`` turns out to be a
            "Syntyneet <place>" marker.
        :return: dict with name, gender, birth year, location name and a new id.
        :raises StopChildExtractionException: when ``child`` is a
            "Syntyneet <place>" marker rather than a child.
        :raises regex_utils.RegexNoneMatchException: propagated from the name
            search (presumably raised by ``safe_search`` when nothing matches).
        """
        birth_loc = regex_utils.search(r"syntyneet{s<=1}\s(?P<location>\w*)",
                                       child, self.CHILD_OPTIONS)
        if birth_loc is not None:
            # found a "Syntyneet <place>" string. Set it to the previous children.
            for c in child_list:
                if c[KEYS["childLocationName"]] == "":
                    c[KEYS["childLocationName"]] = birth_loc.group("location")
            raise StopChildExtractionException(
                'Child extraction should be stopped here. Current child is not valid child.'
            )

        name = regex_utils.safe_search(self.NAME_PATTERN, child,
                                       self.CHILD_OPTIONS).group("name")
        name = name.strip().strip("-").strip(" ")

        try:
            gender = Sex.find_sex(name)
        except SexException:
            self.metadata_collector.add_error_record('genderNotFound', 2)
            gender = None

        try:
            year_match = regex_utils.safe_search(self.YEAR_PATTERN, child,
                                                 self.CHILD_OPTIONS)
            year = year_match.group("year")
            # Two-digit years below 70 are read as 19xx, the rest as 18xx.
            if int(year) < 70:
                year = "19" + year
            else:
                year = "18" + year
        except regex_utils.RegexNoneMatchException:
            year = ""

        try:
            loc_match = regex_utils.safe_search(self.LOCATION_PATTERN, child,
                                                self.CHILD_OPTIONS)
            location = loc_match.group("location").strip().strip("-")
        except regex_utils.RegexNoneMatchException:
            location = ""

        return {
            KEYS["childName"]: name,
            KEYS["gender"]: gender,
            KEYS["birthYear"]: text_utils.int_or_none(year),
            KEYS["childLocationName"]: location,
            KEYS["kairaId"]: self._kaira_id_provider.get_new_id('C')
        }

    def _find_birth_coord_and_region(self, location_name):
        """Look up coordinates for ``location_name``.

        :return: the geocoder's result, or its empty-coordinates value when
            the lookup raises LocationNotFound.
        """
        try:
            return self.geocoder.get_coordinates(location_name)
        except LocationNotFound:
            return self.geocoder.get_empty_coordinates()
class SpouseExtractor(BaseExtractor):
    """Extract spouse data from the part of an entry introduced by "Puol"."""

    extraction_key = 'spouse'

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Configure the id provider, regexes and flags used for extraction.

        Both arguments are forwarded unchanged to the base-class initializer.
        """
        super(SpouseExtractor, self).__init__(cursor_location_depends_on,
                                              options)
        self.kaira_id_provider = KairaIdProvider()

        # Captures everything after "Puol" up to (but excluding) the next
        # section keyword listed in the lookahead.
        self.PATTERN = r'Puol\.?,?(?P<spousedata>[A-ZÄ-Öa-zä-ö\s\.,\d-]*)(?=(Lapset|poika|tytär|asuinp))'
        # Leading run of word characters, whitespace and hyphens.
        self.NAMEPATTERN = r'(?P<name>^[\w\s-]*)'
        self.OPTIONS = (re.UNICODE | re.IGNORECASE)
        self.REQUIRES_MATCH_POSITION = False
        self.SUBSTRING_WIDTH = 100

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the spouse in ``entry`` and store it in the results.

        :return: whatever ``_add_to_extraction_results`` returns.
        """
        start_position = self.get_starting_position(extraction_metadata)
        result, cursor_location = self._find_spouse(entry, start_position)

        return self._add_to_extraction_results(result,
                                               extraction_results,
                                               extraction_metadata,
                                               cursor_location=cursor_location)

    def _find_spouse(self, entry, start_position):
        """Search ``entry['text']`` for the spouse section.

        :return: ``(spouse_data, cursor_location)``. When the section is not
            found, ``spouse_data`` is None and the cursor stays at
            ``start_position``.
        """
        cursor_location = start_position
        spouse_data = None

        try:
            found_spouse_match = regex_utils.safe_search(
                self.PATTERN, entry['text'], self.OPTIONS)
            spouse_data = self._find_spouse_data(
                found_spouse_match.group('spousedata'), entry)

            # Dirty fix for inaccuracy in positions which would screw the Location extraction
            cursor_location = found_spouse_match.end() + start_position - 4
        except regex_utils.RegexNoneMatchException:
            # No spouse section in this entry; keep the defaults.
            pass

        return spouse_data, cursor_location

    def _find_spouse_data(self, sub_text, entry):
        """Build the spouse record from the captured spouse text.

        :param sub_text: the ``spousedata`` group captured by ``PATTERN``.
        :param entry: the full entry; its ``'full_text'`` value is passed on
            to the sub-extraction pipeline.
        :return: dict combining the pipeline's ``'spouse'`` result with the
            spouse name, a has-spouse flag and a new id; None when a
            RegexNoneMatchException is raised along the way.
        """
        try:
            spouse_name_match = regex_utils.safe_search(
                self.NAMEPATTERN, sub_text, self.OPTIONS)
            spouse_name = spouse_name_match.group('name').strip()
            # Drop a trailing " o" left at the end of the name.
            # NOTE(review): presumably the start of an "o.s." marker cut off
            # at the dot -- confirm against the source data.
            spouse_name = re.sub(r'\so$', '', spouse_name)

            # The details text starts two characters before the end of the
            # name match. Clamp at 0: a name match shorter than two characters
            # would otherwise give a negative index and slice from the END of
            # the text.
            details_start = max(spouse_name_match.end() - 2, 0)
            spouse_details, _ = self._find_spouse_details(
                sub_text[details_start:], entry['full_text'])
            spouse_details = spouse_details['spouse']

            return {
                **spouse_details,
                KEYS['spouseName']: spouse_name,
                KEYS['hasSpouse']: True,
                KEYS['kairaId']: self.kaira_id_provider.get_new_id('S'),
            }
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('spouseNotFound', 6)
            # Return the same "no spouse" value that _find_spouse uses, so the
            # stored result is always either a dict or None.
            return None

    def _find_spouse_details(self, text, full_text):
        """Run the sub-extraction pipeline on ``text`` and return its result.

        The caller unpacks the result as a ``(details, metadata)`` pair.
        """
        return self._sub_extraction_pipeline.process({
            'text': text,
            'full_text': full_text
        })
# Example #6
0
class CommonChildExtractor(BaseExtractor):
    """Child-list extraction logic built on regex attributes set elsewhere.

    CHILD_PATTERN, CHILD_OPTIONS, SPLIT_PATTERN1, SPLIT_OPTIONS1, NAME_PATTERN
    and YEAR_PATTERN are read here but not assigned in this class; presumably
    subclasses provide them -- verify against the concrete extractors.
    """

    geocoder = GeoCoder()
    extraction_key = 'children'

    def __init__(self, cursor_location_depends_on=None, options=None):
        """Forward both arguments to the base class and create the id provider."""
        super(CommonChildExtractor, self).__init__(
            cursor_location_depends_on, options)
        self._kaira_id_provider = KairaIdProvider()

    def _extract(self, entry, extraction_results, extraction_metadata):
        """Find the children in ``entry['text']`` and store them in the results."""
        start_position = self.get_starting_position(extraction_metadata)
        children, cursor_location = self._find_children(entry['text'],
                                                        start_position)
        return self._add_to_extraction_results(
            children, extraction_results, extraction_metadata,
            cursor_location)

    def _find_children(self, text, start_position):
        """Locate the children section in ``text`` and parse it.

        :return: ``(children_entries, cursor_location)``. When the section is
            not found an error record is added and the cursor stays at
            ``start_position``.
        """
        children_entries = []
        cursor_location = start_position
        # Treat "sekä" as just another separator between children.
        normalized = re.sub(r"sekä", ",", text)

        try:
            section = regex_utils.safe_search(
                self.CHILD_PATTERN, normalized, self.CHILD_OPTIONS)
            cursor_location = section.end()
            children_entries = self._split_children(
                self._clean_children(section.group("children")))
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('childrenNotFound', 5)

        return children_entries, cursor_location

    @staticmethod
    def _clean_children(children_str):
        """Strip surrounding commas, then periods, then whitespace (in that order)."""
        return children_str.strip(",").strip(".").strip()

    def _split_children(self, children_str):
        """Split the children string into fragments and parse each one.

        A fragment containing the word "ja" between whitespace is treated as
        two children; fragments that fail to parse are left out.
        """
        children_entries = []
        fragments = regex_utils.regex_iter(
            self.SPLIT_PATTERN1, children_str, self.SPLIT_OPTIONS1)

        for fragment_match in fragments:
            fragment = fragment_match.group("child")
            # check if there is "ja" word as separator such as "Seppo -41 ja Jaakko -32.
            separator = regex_utils.search(r"\sja\s", fragment)

            if separator is None:
                single = self._process_child(fragment)
                if single is not None:
                    children_entries.append(single)
                continue

            first_child = self._process_child(fragment[0:separator.start()])
            second_child = self._process_child(fragment[separator.end():])
            if first_child is not None and second_child is not None:
                self._twins_year_handler(first_child, second_child)
            # Keep whichever of the two parsed successfully, in order.
            children_entries.extend(
                parsed for parsed in (first_child, second_child)
                if parsed is not None)

        return children_entries

    @staticmethod
    def _twins_year_handler(first, second):
        """Copy the second child's birth year to the first when the first has none."""
        # if there is twins, the book doesn't explicitly define birthyear for first one.
        # therefore copy second child's value to first one
        if first is None or second is None:
            return
        if first["birthYear"] is None and second["birthYear"] is not None:
            first["birthYear"] = second["birthYear"]

    def _process_child(self, child):
        """Parse one child fragment.

        :return: dict with name, gender, birth year and a new id, or None when
            a RegexNoneMatchException escapes the parsing (e.g. no name found).
        """
        try:
            name_match = regex_utils.safe_search(
                self.NAME_PATTERN, child, self.CHILD_OPTIONS)
            name = name_match.group("name").strip().strip("-").strip(" ")

            try:
                gender = Sex.find_sex(name)
            except SexException:
                self.metadata_collector.add_error_record('genderNotFound', 2)
                gender = None

            try:
                two_digits = regex_utils.safe_search(
                    self.YEAR_PATTERN, child,
                    self.CHILD_OPTIONS).group("year")
                # Two-digit years below 70 are read as 19xx, the rest as 18xx.
                century = "19" if float(two_digits) < 70 else "18"
                year = text_utils.int_or_none(century + two_digits)
            except regex_utils.RegexNoneMatchException:
                year = None

            return {
                "name": name,
                "gender": gender,
                "birthYear": year,
                "kairaId": self._kaira_id_provider.get_new_id('C')
            }
        except regex_utils.RegexNoneMatchException:
            return None
# Example #7
0
 def __init__(self, cursor_location_depends_on=None, options=None):
     """Forward both arguments to the base class and create the id provider."""
     super(CommonChildExtractor, self).__init__(
         cursor_location_depends_on, options)
     # Provider kept on the instance for later id generation.
     self._kaira_id_provider = KairaIdProvider()
# Example #8
0
def reset_kaira_id():
    """Create a ``KairaIdProvider`` and call its ``reset`` method."""
    KairaIdProvider().reset()