def test_get_name_from_complete_name(self):
     '''
     Test get name from complete name

     Each case feeds a complete name to get_name_surname_from_complete_name
     and checks the returned name and surname. The third returned value is
     not checked here.
     '''
     #Spanish convention with two surnames; the input carries stray whitespace
     name1 = "José Martínez  Pérez "
     name, surname, _ = get_name_surname_from_complete_name(name1, convention="spanish_surname")
     assert name == "José"
     assert surname == "Martínez Pérez"

     #Default convention
     name2 = "John Smith"
     name, surname, _ = get_name_surname_from_complete_name(name2)
     assert name == "John"
     assert surname == "Smith"

     #An unsupported convention is expected to give neither name nor surname
     name, surname, _ = get_name_surname_from_complete_name(name2, convention="wrong_convention")
     assert name is None
     assert surname is None

     #This one makes sure that having a spanish naming convention with only one surname will work.
     name3 = "Benito Molpecérez"
     name, surname, _ = get_name_surname_from_complete_name(name3, convention="spanish_surname")
     assert name == "Benito"
     assert surname == "Molpecérez"

     #With the Spanish language hint both words are expected to be kept as the name
     name4 = "Valentín Lupicino"
     name, surname, _ = get_name_surname_from_complete_name(name4, convention="spanish_surname", language="es")
     assert name == name4
     assert surname == ""

     #The expected output carries the accents that the input lacks
     name5 = "Valentin Lupicino Martin"
     name, surname, _ = get_name_surname_from_complete_name(name5, convention="spanish_surname", language="es")
     assert name == "Valentín Lupicino"
     assert surname == "Martín"

     #A single word is expected to be returned as the name with an empty surname
     name6 = "Pedro"
     name, surname, _ = get_name_surname_from_complete_name(name6, convention="spanish_surname", language="es")
     assert name == "Pedro"
     assert surname == ""

     #The expected output uses a corrected spelling of the name
     name7 = "Hijinia"
     name, surname, _ = get_name_surname_from_complete_name(name7, convention="spanish_surname", language="es")
     assert name == "Higinia"
     assert surname == ""

     #Bug with particle San in Spanish
     name8 = "Michaela San Miguel"
     name, surname, _ = get_name_surname_from_complete_name(name8, convention="spanish_surname", language="es")
     assert name == "Micaela"
     assert surname == "San Miguel"

     #Deal with unknown surnames
     name9 = "Pedro María Ortega Cynara"
     name, surname, _ = get_name_surname_from_complete_name(name9, convention="spanish_surname", language="es")
     assert name == "Pedro María"
     assert surname == "Ortega Cynara"
예제 #2
0
 def handle_data(self, data):
     '''
     Turn the text of a name element into a profile record.

     Nothing is done unless the inname flag is set. The flag is cleared
     before the text is parsed, and the resulting profile is appended to
     self.records.
     '''
     if not self.inname:
         return
     self.inname = False
     #The text is split using the Spanish surname convention and language
     first_name, family_name, _ = get_name_surname_from_complete_name(
         data, language="es", convention="spanish_surname")
     record = gen_profile(first_name, family_name)
     self.records.append(record)
예제 #3
0
 def handle_endtag(self, tag):
     '''
     Track the closing of nested divs and emit a record when a list item closes.

     While inside the results area every closing div lowers the nesting
     counter; once the counter reaches zero the results area is left.
     A closing li that is still inside the results area produces a profile
     built from the data stored on the parser, appended to self.records.
     '''
     #A div was closed while inside the results: unwind one nesting level
     if self.inside_results and tag == 'div' and self.internal_div > 0:
         self.internal_div -= 1
         if self.internal_div == 0:
             self.inside_results = False
     #Only a closing list item inside the results produces a record
     if not (self.inside_results and tag == "li"):
         return
     first_name, family_name, _ = get_name_surname_from_complete_name(self.name, convention="spanish_surname")
     record = gen_profile(first_name, family_name)
     record.setWebReference(self.web_link)
     died = self.death_date
     record.setCheckedDate("death", died.year, died.month, died.day, "EXACT")
     record.setComments(self.comments)
     self.records.append(record)
예제 #4
0
 def handle_data(self, data):
     '''
     Collect description text, the person's name and, at the end of a citation, emit a profile.

     Three independent steps are driven by parser flags:
     - inside_description: the text is searched for an age and a place of
       death, and kept as the comment.
     - inside_profile: courtesy titles and the trailing notice label are
       removed from the text, which is stored as the name.
     - ending_citation and inside_citation: a profile is built from the
       stored data and appended to self.records.
     '''
     if self.inside_description:
         self.inside_description = False
         lowered = data.lower()
         age_match = re.search('a los (.*) años', lowered)
         place_match = re.search('fallecido en (.*) el día', lowered)
         if age_match:
             self.age_here = True
             self.age = age_match.group(1)
         if place_match:
             self.location_here = True
             self.location = place_match.group(1)
         self.comment = data
     if self.inside_profile:
         self.inside_profile = False
         #The tokens are removed one after another, in this exact order
         cleaned = data
         for token in (" : Fallecimiento", "Don ", "Doña ", "D. ", "DON ", "DOÑA "):
             cleaned = cleaned.replace(token, "")
         self.name = cleaned
     if not (self.ending_citation and self.inside_citation):
         return
     first_name, family_name, _ = get_name_surname_from_complete_name(
         self.name, convention="spanish_surname")
     profile = gen_profile(first_name, family_name)
     profile.setCheckedGender(self.sex)
     profile.setWebReference(self.weblink)
     profile.setComments(self.comment)
     profile.setCheckedDateWithDates("death", self.death_date, "EXACT")
     if self.age_here:
         self.age_here = False
         #Just in case we have not extracted the right age of the person
         if self.age.isdigit():
             birth_year = self.death_date.year - int(self.age)
             profile.setCheckedDate("birth", birth_year, accuracy="ABOUT")
     if self.location_here:
         self.location_here = False
         profile.setPlaces("death", self.location, language="es")
     self.records.append(profile)
     #We just mark the end of the profile extraction
     self.ending_citation = False
     self.inside_citation = False
 def handle_endtag(self, tag):
     '''
     Emit the collected profile when a tag closes while inside a description.

     The profile is built from the name, web link, death date, age and
     location stored on the parser. Afterwards the description flag is
     cleared, the per-person data is re-initialised and the profile is
     appended to self.profiles.
     '''
     if not self.inside_description:
         return
     first_name, family_name, _ = get_name_surname_from_complete_name(
         self.name, convention="spanish_surname")
     profile = gen_profile(first_name, family_name)
     profile.setWebReference(self.weblink)
     died = self.death_date
     profile.setCheckedDate("death", died.year, died.month, died.day, "EXACT")
     #To avoid mistakes in the parsing or issues in registers
     if self.age and 0 < self.age < MAXIMUM_LIFESPAN:
         profile.setCheckedDate("birth", died.year - self.age, accuracy="ABOUT")
     if self.location:
         profile.setPlaces("death", self.location, language="es")
     self.inside_description = False
     self.initiate_person_data()
     self.profiles.append(profile)
예제 #6
0
 def handle_starttag(self, tag, attrs):
     '''
     React to opening tags: links carry the profile data, meta tags carry the date.

     - a: the class attribute marks the start of a profile, href is stored
       as an absolute web link and title is parsed into name and surname.
     - p (inside a profile): marks the start of the data section.
     - meta (inside a profile): a datePublished item is parsed as a
       YYYY-MM-DD date and stored as the death date.

     attrs is iterated as (attribute name, value) pairs.
     '''
     if tag == "a":
         for key, value in attrs:
             if key == "class" and value == "notice_name_link":
                 self.inside_profile = True
             elif key == "href":
                 self.web_link = "https://enmemoria.lavanguardia.com" + value
             elif key == "title":
                 #Drop the notice label and the colon so that only the person's name is left
                 cleaned = value.replace("Fallecimiento", "").replace(":", "").strip()
                 first_name, family_name, _ = get_name_surname_from_complete_name(cleaned, convention="spanish_surname")
                 self.name = first_name
                 self.surname = family_name
     elif tag == "p":
         if self.inside_profile:
             self.inside_data = True
     elif tag == "meta":
         if self.inside_profile:
             #When an attribute is repeated the last value wins
             meta = dict(attrs)
             if meta.get("itemprop") == "datePublished":
                 self.death_date = datetime.strptime(meta.get("content").strip(), "%Y-%m-%d")
 def __get_profiles__(self):
     '''
     This function will take all different profiles included inside the excel file

     The work is done in four steps:
     1. A first pass over the rows below the header row collects every name
        and first surname found in the "father_full_name" and
        "mother_full_name" columns; the most repeated ones become
        self.father_profile and self.mother_profile.
     2. A second pass creates one profile per row and fills it according to
        the header of each non-empty cell. Spouses and the parents of the
        spouse are stored in self.related_profiles and self.parents_profiles
        under the row's id.
     3. The collected profiles are completed: the baptism place is reused as
        birth place when both dates are close, and partner/parent surnames
        are adjusted.
     4. Profiles that merge_profile reports as merged are removed from
        self.profiles.

     Returns False if any gender or date could not be introduced in some
     row (the setter returned a false value), True otherwise.
     '''
     current_sheet = self.loaded_data[self.sheet_title]
     #Counter of missing inputs (columns whose header is not recognised)
     number_missing = 0
     #The id number to be used
     id_profiles = 0
     #Temporal variable checking the correct reading
     correct_introduction = True
     #Intermediate variables for potential parent surnames in the input file
     #Each *_repetitions list holds, at the same index, how many times the value was seen
     potential_father_surname = []
     potential_father_surname_repetitions = []
     potential_mother_surname = []
     potential_mother_surname_repetitions = []
     #Intermediate variables for potential parent names in the input file
     potential_father_name = []
     potential_father_name_repetitions = []
     potential_mother_name = []
     potential_mother_name_repetitions = []
     #We firstly detect the surnames of the parents of the profile,we cannot avoid the double
     #iteration
     #NOTE(review): the column range stops before max_column while the row range includes max_row,
     #so the column with index max_column is never visited - confirm this is intended
     for row in range(self.initial_row+1, self.loaded_data[self.sheet_title].max_row+1):
         for column_index in range(column_index_from_string(self.initial_column),self.loaded_data[self.sheet_title].max_column):
             #The header row (initial_row) tells what kind of data the column holds
             column_criteria = current_sheet.cell(row=self.initial_row, column=column_index).value
             cell_value = current_sheet.cell(row=row, column=column_index).value
             if (column_criteria in ["father_full_name", "mother_full_name"]  ):
                 #If the cell_value is null we shall avoid continuing
                 if (cell_value != None):
                     name_data = get_name_surname_from_complete_name(cell_value, convention=self.naming_convention, language=self.language)
                     #We have two surnames or one?
                     surname_cand = name_data[1]
                     #When the third returned value is 2 only the first word of the surname is kept
                     if (name_data[2] == 2):
                         surname_cand = name_data[1].split()[0]
                     if(column_criteria == "father_full_name"):
                         if (not surname_cand in potential_father_surname):
                             potential_father_surname.append(surname_cand)
                             potential_father_surname_repetitions.append(1)
                         else:
                             index = potential_father_surname.index(surname_cand)
                             potential_father_surname_repetitions[index] = potential_father_surname_repetitions[index] + 1
                         if (not name_data[0] in potential_father_name):
                             potential_father_name.append(name_data[0])
                             potential_father_name_repetitions.append(1)
                         else:
                             index = potential_father_name.index(name_data[0])
                             potential_father_name_repetitions[index] = potential_father_name_repetitions[index] + 1
                     elif(column_criteria == "mother_full_name"):
                         if (not surname_cand in potential_mother_surname):
                             potential_mother_surname.append(surname_cand)
                             potential_mother_surname_repetitions.append(1)
                         else:
                             index = potential_mother_surname.index(surname_cand)
                             potential_mother_surname_repetitions[index] = potential_mother_surname_repetitions[index] + 1
                         if (not name_data[0] in potential_mother_name):
                             potential_mother_name.append(name_data[0])
                             potential_mother_name_repetitions.append(1)
                         else:
                             index = potential_mother_name.index(name_data[0])
                             potential_mother_name_repetitions[index] = potential_mother_name_repetitions[index] + 1
     #The most repeated name and surname are taken as the ones of each parent
     #NOTE(review): max() raises ValueError if no father or mother name was collected above
     index_father_surname = potential_father_surname_repetitions.index(max(potential_father_surname_repetitions))
     index_mother_surname = potential_mother_surname_repetitions.index(max(potential_mother_surname_repetitions))
     father_surname = potential_father_surname[index_father_surname]
     mother_surname = potential_mother_surname[index_mother_surname]
     index_father_name = potential_father_name_repetitions.index(max(potential_father_name_repetitions))
     index_mother_name = potential_mother_name_repetitions.index(max(potential_mother_name_repetitions))
     father_name = potential_father_name[index_father_name]
     mother_name = potential_mother_name[index_mother_name]
     self.father_profile = gen_profile(father_name, father_surname)
     self.mother_profile = gen_profile(mother_name, mother_surname)
     children_surname = get_children_surname(father_surname, mother_surname, self.naming_convention)
     #Now we read the complete file
     for row in range(self.initial_row+1, self.loaded_data[self.sheet_title].max_row+1):
         #The name is a placeholder until the "full_name" column of the row is read
         included_profile = gen_profile("TBD", children_surname)
         included_right = True
         for column_index in range(column_index_from_string(self.initial_column),self.loaded_data[self.sheet_title].max_column):
             column_criteria = current_sheet.cell(row=self.initial_row, column=column_index).value
             cell_value = current_sheet.cell(row=row, column=column_index).value
             #We are ignoring all those cells that are empty.
             if ( cell_value ):
                 this_introduction = True
                 #Ok, now we go one by one each of the different values
                 if(column_criteria == "gender"):
                     this_introduction = included_profile.setCheckedGender(cell_value)
                 elif (column_criteria in LOCATION_EQUIVALENCE.keys()):
                     included_profile.setPlaces(LOCATION_EQUIVALENCE[column_criteria], cell_value, self.language)
                 elif (column_criteria == "person_url"):
                     included_profile.setWebReference("https://familysearch.org/" +cell_value)
                 elif (column_criteria in date_fields.keys()):
                     #Notice that we shall detect if the given date is a year or a specific date
                     #we will make the difference using "about" and using datetime in the background
                     if(is_year(cell_value)):
                         this_introduction = self.__include_a_date__(column_criteria, included_profile, datetime.strptime(str(cell_value.replace(" ", "")), "%Y").date(), "ABOUT")
                     else:
                         this_introduction = self.__include_a_date__(column_criteria, included_profile, datetime.strptime(cell_value, "%d %b %Y").date(), "EXACT")
                 elif(column_criteria == "full_name"):
                     included_profile.set_name(get_name_from_fullname(cell_value,potential_father_surname, potential_mother_surname, language=self.language))
                     #In the case the name is not the same, we create it as nickname
                     if (cell_value != included_profile.returnFullName()): included_profile.add_nickname(cell_value)
                 elif (column_criteria == "spouse_full_name"):
                     #Here we create a new profile using the surname of the guy
                     names = get_name_surname_from_complete_name(cell_value, convention=self.naming_convention, language=self.language)
                     partner = gen_profile(names[0], names[1])
                     partner.set_id(id_profiles)
                     #If the surname is changed we shall include the previous surname in the nicknames
                     if (cell_value != partner.returnFullName()): partner.add_nickname(cell_value)
                     #Now we link the profiles
                     included_profile.set_marriage_id_link(id_profiles)
                     self.related_profiles[id_profiles] = partner
                 elif (column_criteria == "other_full_names"):
                     #The separator provided by family search is semicolon
                     parents = cell_value.split(";")
                     #We obtain firstly the different names
                     father_name, father_surname, _ = get_name_surname_from_complete_name(parents[0], convention=self.naming_convention, language=self.language)
                     #NOTE(review): when only one parent is listed, mother_name and mother_surname keep
                     #the value they had before this cell (first pass or a previous row) - confirm intended
                     if (len(parents) == 2):
                         mother_name, mother_surname, _ = get_name_surname_from_complete_name(parents[1], convention=self.naming_convention, language=self.language)
                     #The algorithm provides an empty surname, we fill it with not known
                     if (father_surname == ""): father_surname = NOT_KNOWN_VALUE
                     if (mother_surname == ""): mother_surname = NOT_KNOWN_VALUE
                     #Create the standard profiles
                     father = gen_profile(father_name, father_surname)
                     mother = gen_profile(mother_name, mother_surname)
                     #If the surname is changed we shall include the previous surname in the nicknames
                     if (parents[0] != father.returnFullName()): father.add_nickname(parents[0])
                     if (len(parents) == 2) and (parents[1] != mother.returnFullName()): mother.add_nickname(parents[1])
                     #add gender
                     father.setCheckedGender("M")
                     mother.setCheckedGender("F")
                     self.parents_profiles[id_profiles] = [father, mother]
                 elif (column_criteria in ignored_fields):
                     pass
                 else:
                     #Unknown column header: it is counted and reported, the value is not used
                     number_missing = number_missing + 1
                     logging.warning(COLUMN_NOT_FOUND + column_criteria)
                 if (not this_introduction): included_right = False
             #This is a way to later on identify the link between the profiles
         id_profiles += 1
         if(not included_right) : correct_introduction = False
         self.profiles.append(included_profile)
     #Now we know the data we fix some with the proper logic
     for profile_obtained in self.profiles:
         #If the baptism and birth are close enough we assign the baptism place to the birth place
         birth_d = profile_obtained.gen_data.get("birth_date", None)
         bapt_d = profile_obtained.gen_data.get("baptism_date", None)
         if birth_d and bapt_d:
             difference = bapt_d - birth_d
             if abs(difference.days) < DIFFERNCE_BIRTH_BAPT:
                 place_birth = profile_obtained.gen_data.get("birth_place", {}).get("raw", None)
                 place_baptism = profile_obtained.gen_data.get("baptism_place", {}).get("raw", None)
                 #Only fill the birth place when it is missing and the baptism place is known
                 if place_baptism and not place_birth:
                     profile_obtained.setPlaces("birth_place",get_location_standard(profile_obtained.gen_data["baptism_place"]), self.language)
         #Profiles with a spouse: the partner inherits the web reference and the marriage data
         if profile_obtained.gen_data.get("marriage_link", None) in self.related_profiles.keys():
             id_of_marriage = profile_obtained.gen_data["marriage_link"]
             partner = self.related_profiles[id_of_marriage]
             partner.setWebReference(profile_obtained.gen_data["web_ref"])
             #It is a partner so we add as opposite sex!
             partner.setCheckedGender(get_partner_gender(profile_obtained.gen_data["gender"]))
             partner.setCheckedDate("marriage_date", profile_obtained.gen_data["marriage_date"], profile_obtained.gen_data["accuracy_marriage_date"]  )
             partner.setPlaces("marriage_place", profile_obtained.gen_data["marriage_place"]["raw"], language=self.language )
             #The parents stored under the same id are the ones read from "other_full_names" in that row
             if id_of_marriage in self.parents_profiles.keys():
                 father = self.parents_profiles[id_of_marriage][0]
                 mother = self.parents_profiles[id_of_marriage][1]
                 father.setWebReference(profile_obtained.gen_data["web_ref"])
                 mother.setWebReference(profile_obtained.gen_data["web_ref"])
                 surnames = get_splitted_name_from_complete_name(partner.gen_data["surname"], language=self.language)[0]
                 if (father.gen_data["surname"] == NOT_KNOWN_VALUE):
                     #It might be the case that the surname is empty
                     #Ok the data was not including the right data, but we know the surname
                     if (self.naming_convention == "spanish_surname" and len(surnames) != 0):
                         father.gen_data["surname"] = surnames[0]
                     else:
                         father.gen_data["surname"] = partner.gen_data["surname"]
                 if (mother.gen_data["surname"] == NOT_KNOWN_VALUE) and (self.naming_convention == "spanish_surname") and (len(surnames) == 2):
                     mother.gen_data["surname"] = surnames[1]
                 if (self.naming_convention == "spanish_surname"):
                     #We need to ensure 2 surnames in spanish naming conventions
                     if not (mother.gen_data["surname"] in partner.gen_data["surname"]) or (len(partner.gen_data["surname"].split()) == 1):
                         #In the case we have 2 surnames, we try to eliminate the second surnames.
                         partner_surname_data = get_splitted_name_from_complete_name(partner.gen_data["surname"], language=self.language)
                         mother_surname_data = get_splitted_name_from_complete_name(mother.gen_data["surname"], language=self.language)
                         #The current full name is kept as nickname before the surname is rewritten
                         if len(partner.gen_data["nicknames"]) == 0: partner.add_nickname(partner.returnFullName())
                         partner.gen_data["surname"] = " ".join([partner_surname_data[0][0], mother_surname_data[0][0]])
     #Finally, let's merge those profiles that are the same!
     indexes_to_remove = []
     #A copy is iterated; deletions from self.profiles only happen after the loop
     iterating_list = list(self.profiles)
     for i in range(len(iterating_list)):
         #We are going one by one all the different profiles
         if not i in indexes_to_remove:
             for j, other_prof in enumerate(iterating_list[i+1:]):
                 merged = self.profiles[i].merge_profile(other_prof, language=self.language, convention=self.naming_convention)
                 if merged:
                     #j counts from the element after i, so i+j+1 is the position in the full list
                     indexes_to_remove.append(i+j+1)
     #Remove duplicates and delete from the end so that the remaining indexes stay valid
     new_values = list(set(indexes_to_remove))
     new_values.sort()
     for deletion in reversed(new_values):
         del self.profiles[deletion]
     return correct_introduction
예제 #8
0
all_events.append(birth_event)
residence_event = event_profile("residence")
residence_event.setDate(1910, 1, 1)
all_events.append(residence_event)
baptism_event = event_profile("baptism")
baptism_event.setDate(1901, 1, 12)
all_events.append(baptism_event)
marriage_event = event_profile("marriage")
marriage_event.setDate(1930, 6, 1)
all_events.append(marriage_event)
death_event = event_profile("death")
death_event.setDate(1970, 1, 1)
all_events.append(death_event)
burial_event = event_profile("burial")
burial_event.setDate(1970, 1, 2)
all_events.append(burial_event)

#This function will provide a True if the dates are consistent (i.e. you are not getting baptised before being born or after dying)
checkDateConsistency(all_events)

#This function will provide the best date, taking 2 dates. In the following case, it will take the residence date as being more accurate
getBestDate(date(1910, 2, 1), "AFTER", date(1910, 5, 1), "EXACT")

GENERIC_PLACE_STRING = "Portillo,Valladolid,Castile and Leon,Spain"
#This function provides a generic location in standard location format. It uses the MAPBOX API behind the scenes
get_formatted_location(GENERIC_PLACE_STRING)

my_name = "John Smith"
#It splits a given name into the name and surname. It checks the data with a database of names and surnames.
#The function returns three values; the third one is not needed here and is discarded.
name, surname, _ = get_name_surname_from_complete_name(my_name, language="en")
 def comparison_score(self,
                      profile,
                      data_language="en",
                      name_convention="father_surname"):
     '''
     Get the score value in comparison

     This profile is compared against another one by names, lifespan,
     gender and dated events. Partial scores are added up and partial
     factors are multiplied together.

     profile: the other profile to compare against
     data_language: language handed to the name helpers
     name_convention: naming convention handed to the name helpers
     Returns a (score, factor) tuple
     '''
     score, factor = get_score_compare_names(self.getName(),
                                             self.getSurname(),
                                             profile.getName(),
                                             profile.getSurname(),
                                             language=data_language,
                                             convention=name_convention)
     #We compare now all the potential names, in case that there is a different convention
     score_nicks = 0
     factor_nicks = 0
     for complete_name_self in self.get_all_names():
         name_self, surname_self, _ = get_name_surname_from_complete_name(
             complete_name_self,
             convention=name_convention,
             language=data_language)
         for complete_name_other in profile.get_all_names():
             name_other, surname_other, _ = get_name_surname_from_complete_name(
                 complete_name_other,
                 convention=name_convention,
                 language=data_language)
             score_int, factor_int = get_score_compare_names(
                 name_self,
                 surname_self,
                 name_other,
                 surname_other,
                 language=data_language,
                 convention=name_convention)
             #We take the bigger of the scores
             if (score_int > score_nicks) and (factor_int > factor_nicks):
                 score_nicks = score_int
                 factor_nicks = factor_int
     #We only take the new score if bigger than the previous one, but we include a penalty (we introduce preference to the score given by formal names)
     if (score_nicks > score) and (factor_nicks > factor) and (
             score_nicks * factor_nicks * 0.5 > score * factor):
         score = score_nicks * 0.5
         factor = factor_nicks
     #Comparing big differences in events
     #First the events of the other profile against the earliest/latest events of this one...
     score1, factor1 = score_factor_birth_and_death(
         self.get_earliest_event_in_event_form(),
         self.get_latest_event_in_event_form(), profile.getEvents())
     #...then the mirrored check: the events of this profile against the earliest/latest
     #events of the other one (both boundaries must come from the same profile)
     score2, factor2 = score_factor_birth_and_death(
         profile.get_earliest_event_in_event_form(),
         profile.get_latest_event_in_event_form(), self.getEvents())
     #In this stage we add all obtained scores and factors.
     score += score1 + score2
     factor = factor * factor1 * factor2
     #Comparing gender
     if (self.getGender()) and (profile.getGender()):
         if self.getGender() == profile.getGender():
             score += 0.5
         elif (self.getGender() != "U") and (profile.getGender() != "U"):
             #Both genders are known and differ: strong penalty
             factor = 0.1 * factor
     for event_name in MERGE_EVENTS:
         self_event = self.get_specific_event(event_name)
         other_event = profile.get_specific_event(event_name)
         if event_name == "marriage":
             #For marriage the lookup gives a collection; only its first entry is compared
             self_event = self_event[0] if len(self_event) > 0 else None
             other_event = other_event[0] if len(other_event) > 0 else None
         if self_event and other_event and self_event.is_any_date_available(
         ) and other_event.is_any_date_available():
             score_temp, factor_temp = get_score_compare_dates(
                 self_event, other_event)
             #Residence events never count, and a marriage is ignored when it would lower the factor
             if (not ((self_event.get_event_type() == "marriage") and
                      (factor_temp < 1.0))) and (
                          self_event.get_event_type() != "residence"):
                 score += score_temp
                 factor = factor * factor_temp
     return score, factor