def test_get_name_from_complete_name(self):
    '''
    Test the split of a complete name into name and surname.

    Exercises get_name_surname_from_complete_name with the default
    convention, the "spanish_surname" convention (with and without the
    Spanish language database) and an unknown convention.
    '''
    # Two Spanish surnames; note the trailing blank in the input.
    name1 = "José Martínez Pérez "
    name, surname, _ = get_name_surname_from_complete_name(
        name1, convention="spanish_surname")
    assert name == "José"
    assert surname == "Martínez Pérez"

    # Default convention.
    name2 = "John Smith"
    name, surname, _ = get_name_surname_from_complete_name(name2)
    assert name == "John"
    assert surname == "Smith"

    # An unknown convention yields no name and no surname.
    name, surname, _ = get_name_surname_from_complete_name(
        name2, convention="wrong_convention")
    assert name is None
    assert surname is None

    # This one makes sure that having a Spanish naming convention with
    # only one surname will work.
    name3 = "Benito Molpecérez"
    name, surname, _ = get_name_surname_from_complete_name(
        name3, convention="spanish_surname")
    assert name == "Benito"
    assert surname == "Molpecérez"

    # With the Spanish language data both words are expected to be
    # treated as a given name.
    name4 = "Valentín Lupicino"
    name, surname, _ = get_name_surname_from_complete_name(
        name4, convention="spanish_surname", language="es")
    assert name == name4
    assert surname == ""

    # Missing accents are expected to be restored.
    name5 = "Valentin Lupicino Martin"
    name, surname, _ = get_name_surname_from_complete_name(
        name5, convention="spanish_surname", language="es")
    assert name == "Valentín Lupicino"
    assert surname == "Martín"

    # A single word is a name without surname.
    name6 = "Pedro"
    name, surname, _ = get_name_surname_from_complete_name(
        name6, convention="spanish_surname", language="es")
    assert name == "Pedro"
    assert surname == ""

    # A misspelled name is expected to be normalised.
    name7 = "Hijinia"
    name, surname, _ = get_name_surname_from_complete_name(
        name7, convention="spanish_surname", language="es")
    assert name == "Higinia"
    assert surname == ""

    # Bug with the particle "San" in Spanish.
    name8 = "Michaela San Miguel"
    name, surname, _ = get_name_surname_from_complete_name(
        name8, convention="spanish_surname", language="es")
    assert name == "Micaela"
    assert surname == "San Miguel"

    # Deal with unknown surnames.
    name9 = "Pedro María Ortega Cynara"
    name, surname, _ = get_name_surname_from_complete_name(
        name9, convention="spanish_surname", language="es")
    assert name == "Pedro María"
    assert surname == "Ortega Cynara"
def handle_data(self, data):
    '''
    Turn the text found while ``self.inname`` is set into a profile.

    When the flag is not set the text is ignored. Otherwise the flag is
    cleared, the text is split into name and surname using the Spanish
    surname convention and the resulting profile is appended to
    ``self.records``.
    '''
    if not self.inname:
        return
    # The flag is consumed by the first piece of text that arrives.
    self.inname = False
    first_name, family_name, _ = get_name_surname_from_complete_name(
        data, convention="spanish_surname", language="es")
    self.records.append(gen_profile(first_name, family_name))
def handle_endtag(self, tag):
    '''
    Track the closing tags of the results area and emit finished profiles.

    A closing ``div`` inside the results decreases the depth counter
    ``self.internal_div``; once the counter is zero the results area is
    considered closed. A closing ``li`` while still inside the results
    builds a profile from the data collected on ``self`` and appends it
    to ``self.records``.
    '''
    if self.inside_results and tag == 'div':
        # One of the nested divs has been closed.
        if self.internal_div > 0:
            self.internal_div -= 1
        if self.internal_div == 0:
            self.inside_results = False

    if not self.inside_results or tag != "li":
        return

    first_name, family_name, _ = get_name_surname_from_complete_name(
        self.name, convention="spanish_surname")
    record = gen_profile(first_name, family_name)
    record.setWebReference(self.web_link)
    died = self.death_date
    record.setCheckedDate("death", died.year, died.month, died.day, "EXACT")
    record.setComments(self.comments)
    self.records.append(record)
def handle_data(self, data):
    '''
    Process a piece of text according to the current parser state.

    * Inside a description: look for the age ("a los ... años") and the
      place of death ("fallecido en ... el día") in the lower-cased text
      and keep the raw text as comment.
    * Inside a profile: store the name without the notice suffix and the
      Spanish honorifics.
    * At the end of a citation: build the profile from the collected
      data, append it to ``self.records`` and reset the citation flags.
    '''
    if self.inside_description:
        self.inside_description = False
        lowered = data.lower()
        age_match = re.search('a los (.*) años', lowered)
        place_match = re.search('fallecido en (.*) el día', lowered)
        if age_match:
            self.age_here = True
            self.age = age_match.group(1)
        if place_match:
            self.location_here = True
            self.location = place_match.group(1)
        self.comment = data

    if self.inside_profile:
        self.inside_profile = False
        cleaned = data
        # Order matters: it mirrors the sequence of replacements applied
        # to the raw title.
        for token in (" : Fallecimiento", "Don ", "Doña ", "D. ", "DON ", "DOÑA "):
            cleaned = cleaned.replace(token, "")
        self.name = cleaned

    if not (self.ending_citation and self.inside_citation):
        return

    first_name, family_name, _ = get_name_surname_from_complete_name(
        self.name, convention="spanish_surname")
    person = gen_profile(first_name, family_name)
    person.setCheckedGender(self.sex)
    person.setWebReference(self.weblink)
    person.setComments(self.comment)
    person.setCheckedDateWithDates("death", self.death_date, "EXACT")
    if self.age_here:
        self.age_here = False
        # Just in case the extracted text is not really an age.
        if self.age.isdigit():
            birth_year = self.death_date.year - int(self.age)
            person.setCheckedDate("birth", birth_year, accuracy="ABOUT")
    if self.location_here:
        self.location_here = False
        person.setPlaces("death", self.location, language="es")
    self.records.append(person)
    # Mark the end of the profile extraction.
    self.ending_citation = False
    self.inside_citation = False
def handle_endtag(self, tag):
    '''
    Close the description currently being parsed and store its profile.

    Nothing happens unless ``self.inside_description`` is set. Otherwise
    a profile is built from the name, web link, death date, age and
    location collected on ``self``, the per-person state is reset through
    ``initiate_person_data`` and the profile is appended to
    ``self.profiles``.
    '''
    if not self.inside_description:
        return

    first_name, family_name, _ = get_name_surname_from_complete_name(
        self.name, convention="spanish_surname")
    person = gen_profile(first_name, family_name)
    person.setWebReference(self.weblink)
    person.setCheckedDate("death", self.death_date.year,
                          self.death_date.month, self.death_date.day, "EXACT")
    # Only plausible ages are used, to avoid parsing mistakes or issues
    # in the registers.
    if self.age and 0 < self.age < MAXIMUM_LIFESPAN:
        person.setCheckedDate("birth", self.death_date.year - self.age,
                              accuracy="ABOUT")
    if self.location:
        person.setPlaces("death", self.location, language="es")

    self.inside_description = False
    self.initiate_person_data()
    self.profiles.append(person)
def handle_starttag(self, tag, attrs):
    '''
    Update the parser state for an opening tag.

    * ``a``: a ``class="notice_name_link"`` attribute marks the start of
      a profile, ``href`` is stored as absolute web link and ``title``
      is split into ``self.name`` and ``self.surname``.
    * ``p`` inside a profile: marks the start of the data paragraph.
    * ``meta`` inside a profile: an ``itemprop="datePublished"`` entry
      sets ``self.death_date`` from its ``content`` (format YYYY-MM-DD).

    attrs is a sequence of (attribute name, value) pairs.
    '''
    if tag == "a":
        for attr in attrs:
            if attr[0] == "class" and attr[1] == "notice_name_link":
                self.inside_profile = True
            elif attr[0] == "href":
                self.web_link = "https://enmemoria.lavanguardia.com" + attr[1]
            elif attr[0] == "title":
                name_data = attr[1].replace("Fallecimiento", "").replace(":", "").strip()
                name, surname, _ = get_name_surname_from_complete_name(
                    name_data, convention="spanish_surname")
                self.name = name
                self.surname = surname
    elif tag == "p" and self.inside_profile:
        self.inside_data = True
    elif tag == "meta" and self.inside_profile:
        itemprop = None
        content = None
        for attr in attrs:
            if attr[0] == "itemprop":
                itemprop = attr[1]
            elif attr[0] == "content":
                content = attr[1]
        # A datePublished entry without content (missing or valueless
        # attribute) used to crash on content.strip(); it is now ignored.
        if itemprop == "datePublished" and content:
            self.death_date = datetime.strptime(content.strip(), "%Y-%m-%d")
def __get_profiles__(self):
    '''
    Read every profile contained in the loaded spreadsheet.

    The sheet ``self.loaded_data[self.sheet_title]`` is walked twice:

    1. A first pass over the "father_full_name" / "mother_full_name"
       columns counts how often each parent name and (first) surname
       appears; the most repeated ones become ``self.father_profile``
       and ``self.mother_profile`` and define the surname of the children.
    2. A second pass builds one profile per row, filling it column by
       column according to the header found in row ``self.initial_row``.
       Spouses go to ``self.related_profiles`` and the spouse's parents
       to ``self.parents_profiles``, both keyed by the row counter.

    Afterwards the data is post-processed (birth place borrowed from a
    close baptism, partner data completed from the marriage, partner
    surnames normalised for the Spanish convention) and profiles that
    ``merge_profile`` reports as merged are removed from ``self.profiles``.

    Returns True unless a gender or date value of some row was reported
    as not introduced (falsy result of setCheckedGender or
    __include_a_date__).

    NOTE(review): the sheet looks like an openpyxl worksheet
    (cell(row=, column=), max_row, max_column) - confirm.
    '''
    current_sheet = self.loaded_data[self.sheet_title]
    #Counter of columns whose header is not recognised
    #NOTE(review): incremented below but never read in this method
    number_missing = 0
    #The id number to be used
    id_profiles = 0
    #Temporary variable checking the correct reading
    correct_introduction = True
    #Intermediate variables for potential parent surnames in the input file
    #(each *_repetitions list holds the count of the item at the same index)
    potential_father_surname = []
    potential_father_surname_repetitions = []
    potential_mother_surname = []
    potential_mother_surname_repetitions = []
    #Intermediate variables for potential parent names in the input file
    potential_father_name = []
    potential_father_name_repetitions = []
    potential_mother_name = []
    potential_mother_name_repetitions = []
    #We firstly detect the surnames of the parents of the profile, we cannot avoid the double
    #iteration
    #NOTE(review): the column range stops before max_column while the row range uses
    #max_row+1 - confirm that skipping the last column is intended
    for row in range(self.initial_row+1, self.loaded_data[self.sheet_title].max_row+1):
        for column_index in range(column_index_from_string(self.initial_column),self.loaded_data[self.sheet_title].max_column):
            column_criteria = current_sheet.cell(row=self.initial_row, column=column_index).value
            cell_value = current_sheet.cell(row=row, column=column_index).value
            if (column_criteria in ["father_full_name", "mother_full_name"] ):
                #If the cell_value is null we shall avoid continuing
                if (cell_value != None):
                    #name_data is indexed as (name, surname, third value); the third value is
                    #compared with 2 below, presumably the number of surnames - confirm
                    name_data = get_name_surname_from_complete_name(cell_value, convention=self.naming_convention, language=self.language)
                    #We have two surnames or one? With two, only the first word is kept
                    surname_cand = name_data[1]
                    if (name_data[2] == 2):
                        surname_cand = name_data[1].split()[0]
                    if(column_criteria == "father_full_name"):
                        if (not surname_cand in potential_father_surname):
                            potential_father_surname.append(surname_cand)
                            potential_father_surname_repetitions.append(1)
                        else:
                            index = potential_father_surname.index(surname_cand)
                            potential_father_surname_repetitions[index] = potential_father_surname_repetitions[index] + 1
                        if (not name_data[0] in potential_father_name):
                            potential_father_name.append(name_data[0])
                            potential_father_name_repetitions.append(1)
                        else:
                            index = potential_father_name.index(name_data[0])
                            potential_father_name_repetitions[index] = potential_father_name_repetitions[index] + 1
                    elif(column_criteria == "mother_full_name"):
                        if (not surname_cand in potential_mother_surname):
                            potential_mother_surname.append(surname_cand)
                            potential_mother_surname_repetitions.append(1)
                        else:
                            index = potential_mother_surname.index(surname_cand)
                            potential_mother_surname_repetitions[index] = potential_mother_surname_repetitions[index] + 1
                        if (not name_data[0] in potential_mother_name):
                            potential_mother_name.append(name_data[0])
                            potential_mother_name_repetitions.append(1)
                        else:
                            index = potential_mother_name.index(name_data[0])
                            potential_mother_name_repetitions[index] = potential_mother_name_repetitions[index] + 1
    #The most repeated candidate wins (first one in case of a tie)
    #NOTE(review): max() raises ValueError when no father/mother name was found in the sheet
    index_father_surname = potential_father_surname_repetitions.index(max(potential_father_surname_repetitions))
    index_mother_surname = potential_mother_surname_repetitions.index(max(potential_mother_surname_repetitions))
    father_surname = potential_father_surname[index_father_surname]
    mother_surname = potential_mother_surname[index_mother_surname]
    index_father_name = potential_father_name_repetitions.index(max(potential_father_name_repetitions))
    index_mother_name = potential_mother_name_repetitions.index(max(potential_mother_name_repetitions))
    father_name = potential_father_name[index_father_name]
    mother_name = potential_mother_name[index_mother_name]
    self.father_profile = gen_profile(father_name, father_surname)
    self.mother_profile = gen_profile(mother_name, mother_surname)
    children_surname = get_children_surname(father_surname, mother_surname, self.naming_convention)
    #Now we read the complete file
    for row in range(self.initial_row+1, self.loaded_data[self.sheet_title].max_row+1):
        #The name is a placeholder until the "full_name" column is processed
        included_profile = gen_profile("TBD", children_surname)
        included_right = True
        for column_index in range(column_index_from_string(self.initial_column),self.loaded_data[self.sheet_title].max_column):
            column_criteria = current_sheet.cell(row=self.initial_row, column=column_index).value
            cell_value = current_sheet.cell(row=row, column=column_index).value
            #We are ignoring all those cells that are empty.
            if ( cell_value ):
                this_introduction = True
                #Ok, now we go one by one each of the different values
                if(column_criteria == "gender"):
                    this_introduction = included_profile.setCheckedGender(cell_value)
                elif (column_criteria in LOCATION_EQUIVALENCE.keys()):
                    included_profile.setPlaces(LOCATION_EQUIVALENCE[column_criteria], cell_value, self.language)
                elif (column_criteria == "person_url"):
                    included_profile.setWebReference("https://familysearch.org/" +cell_value)
                elif (column_criteria in date_fields.keys()):
                    #Notice that we shall detect if the given date is a year or a specific date
                    #we will make the difference using "ABOUT" and using datetime in the background
                    if(is_year(cell_value)):
                        this_introduction = self.__include_a_date__(column_criteria, included_profile, datetime.strptime(str(cell_value.replace(" ", "")), "%Y").date(), "ABOUT")
                    else:
                        this_introduction = self.__include_a_date__(column_criteria, included_profile, datetime.strptime(cell_value, "%d %b %Y").date(), "EXACT")
                elif(column_criteria == "full_name"):
                    included_profile.set_name(get_name_from_fullname(cell_value,potential_father_surname, potential_mother_surname, language=self.language))
                    #In the case the name is not the same, we keep the original text as nickname
                    if (cell_value != included_profile.returnFullName()):
                        included_profile.add_nickname(cell_value)
                elif (column_criteria == "spouse_full_name"):
                    #Here we create a new profile using the surname of the spouse
                    names = get_name_surname_from_complete_name(cell_value, convention=self.naming_convention, language=self.language)
                    partner = gen_profile(names[0], names[1])
                    partner.set_id(id_profiles)
                    #If the surname is changed we shall include the previous surname in the nicknames
                    if (cell_value != partner.returnFullName()):
                        partner.add_nickname(cell_value)
                    #Now we link the profiles
                    included_profile.set_marriage_id_link(id_profiles)
                    self.related_profiles[id_profiles] = partner
                elif (column_criteria == "other_full_names"):
                    #The separator provided by family search is the semicolon
                    parents = cell_value.split(";")
                    #We obtain firstly the different names
                    father_name, father_surname, _ = get_name_surname_from_complete_name(parents[0], convention=self.naming_convention, language=self.language)
                    #NOTE(review): when there are not exactly two entries, mother_name and
                    #mother_surname keep whatever value they had before (the most repeated
                    #mother or the one of a previous row) - confirm this is intended
                    if (len(parents) == 2):
                        mother_name, mother_surname, _ = get_name_surname_from_complete_name(parents[1], convention=self.naming_convention, language=self.language)
                    #The algorithm provides an empty surname, we fill it with not known
                    if (father_surname == ""):
                        father_surname = NOT_KNOWN_VALUE
                    if (mother_surname == ""):
                        mother_surname = NOT_KNOWN_VALUE
                    #Create the standard profiles
                    father = gen_profile(father_name, father_surname)
                    mother = gen_profile(mother_name, mother_surname)
                    #If the surname is changed we shall include the previous surname in the nicknames
                    if (parents[0] != father.returnFullName()):
                        father.add_nickname(parents[0])
                    if (len(parents) == 2) and (parents[1] != mother.returnFullName()):
                        mother.add_nickname(parents[1])
                    #add gender
                    father.setCheckedGender("M")
                    mother.setCheckedGender("F")
                    self.parents_profiles[id_profiles] = [father, mother]
                elif (column_criteria in ignored_fields):
                    pass
                else:
                    number_missing = number_missing + 1
                    logging.warning(COLUMN_NOT_FOUND + column_criteria)
                if (not this_introduction):
                    included_right = False
        #This is a way to later on identify the link between the profiles
        id_profiles += 1
        if(not included_right) :
            correct_introduction = False
        self.profiles.append(included_profile)
    #Now we know the data we fix some with the proper logic
    for profile_obtained in self.profiles:
        #If the baptism and birth are close enough we assign the baptism place to the birth place
        birth_d = profile_obtained.gen_data.get("birth_date", None)
        bapt_d = profile_obtained.gen_data.get("baptism_date", None)
        if birth_d and bapt_d:
            difference = bapt_d - birth_d
            if abs(difference.days) < DIFFERNCE_BIRTH_BAPT:
                place_birth = profile_obtained.gen_data.get("birth_place", {}).get("raw", None)
                place_baptism = profile_obtained.gen_data.get("baptism_place", {}).get("raw", None)
                if place_baptism and not place_birth:
                    profile_obtained.setPlaces("birth_place",get_location_standard(profile_obtained.gen_data["baptism_place"]), self.language)
        #Only profiles with a registered spouse are processed further
        if profile_obtained.gen_data.get("marriage_link", None) in self.related_profiles.keys():
            id_of_marriage = profile_obtained.gen_data["marriage_link"]
            partner = self.related_profiles[id_of_marriage]
            partner.setWebReference(profile_obtained.gen_data["web_ref"])
            #It is a partner so we add as opposite sex!
            partner.setCheckedGender(get_partner_gender(profile_obtained.gen_data["gender"]))
            partner.setCheckedDate("marriage_date", profile_obtained.gen_data["marriage_date"], profile_obtained.gen_data["accuracy_marriage_date"] )
            partner.setPlaces("marriage_place", profile_obtained.gen_data["marriage_place"]["raw"], language=self.language )
            #The parents stored under the same id are the parents of the partner
            if id_of_marriage in self.parents_profiles.keys():
                father = self.parents_profiles[id_of_marriage][0]
                mother = self.parents_profiles[id_of_marriage][1]
                father.setWebReference(profile_obtained.gen_data["web_ref"])
                mother.setWebReference(profile_obtained.gen_data["web_ref"])
                #First element of the split: used below as the list of surnames of the partner
                surnames = get_splitted_name_from_complete_name(partner.gen_data["surname"], language=self.language)[0]
                if (father.gen_data["surname"] == NOT_KNOWN_VALUE):
                    #It might be the case that the surname is empty
                    #Ok the data was not including the right data, but we know the surname
                    if (self.naming_convention == "spanish_surname" and len(surnames) != 0):
                        father.gen_data["surname"] = surnames[0]
                    else:
                        father.gen_data["surname"] = partner.gen_data["surname"]
                if (mother.gen_data["surname"] == NOT_KNOWN_VALUE) and (self.naming_convention == "spanish_surname") and (len(surnames) == 2):
                    mother.gen_data["surname"] = surnames[1]
                if (self.naming_convention == "spanish_surname"):
                    #We need to ensure 2 surnames in spanish naming conventions
                    if not (mother.gen_data["surname"] in partner.gen_data["surname"]) or (len(partner.gen_data["surname"].split()) == 1):
                        #In the case we have 2 surnames, we try to eliminate the second surnames.
                        partner_surname_data = get_splitted_name_from_complete_name(partner.gen_data["surname"], language=self.language)
                        mother_surname_data = get_splitted_name_from_complete_name(mother.gen_data["surname"], language=self.language)
                        #Keep the original full name before the surname is rewritten
                        if len(partner.gen_data["nicknames"]) == 0:
                            partner.add_nickname(partner.returnFullName())
                        partner.gen_data["surname"] = " ".join([partner_surname_data[0][0], mother_surname_data[0][0]])
    #Finally, let's merge those profiles that are the same!
    indexes_to_remove = []
    #Snapshot of the list: positions stay valid because deletions happen only at the end
    iterating_list = list(self.profiles)
    for i in range(len(iterating_list)):
        #We are going one by one all the different profiles, skipping those already merged
        if not i in indexes_to_remove:
            for j, other_prof in enumerate(iterating_list[i+1:]):
                merged = self.profiles[i].merge_profile(other_prof, language=self.language, convention=self.naming_convention)
                if merged:
                    indexes_to_remove.append(i+j+1)
    #Remove duplicates and delete from the end so that earlier indexes are not shifted
    new_values = list(set(indexes_to_remove))
    new_values.sort()
    for deletion in reversed(new_values):
        del self.profiles[deletion]
    return correct_introduction
all_events.append(birth_event)

residence_event = event_profile("residence")
residence_event.setDate(1910, 1, 1)
all_events.append(residence_event)

baptism_event = event_profile("baptism")
baptism_event.setDate(1901, 1, 12)
all_events.append(baptism_event)

marriage_event = event_profile("marriage")
marriage_event.setDate(1930, 6, 1)
all_events.append(marriage_event)

death_event = event_profile("death")
death_event.setDate(1970, 1, 1)
all_events.append(death_event)

burial_event = event_profile("burial")
burial_event.setDate(1970, 1, 2)
all_events.append(burial_event)

# This function will provide a True if the dates are consistent (i.e. you
# are not getting baptised before being born or after dying).
checkDateConsistency(all_events)

# This function will provide the best date, taking 2 dates with their
# accuracy. In the following case one date is only known as "AFTER" and the
# other one is "EXACT" (presumably the exact one is preferred - confirm).
getBestDate(date(1910, 2, 1), "AFTER", date(1910, 5, 1), "EXACT")

GENERIC_PLACE_STRING = "Portillo,Valladolid,Castile and Leon,Spain"
# This function provides a generic location in standard location format.
# It is using the MAPBOX API behind the scenes.
get_formatted_location(GENERIC_PLACE_STRING)

my_name = "John Smith"
# It splits a given name into the name and surname. It checks the data with
# a database of names and surnames. The function returns three values (name,
# surname and a third one not needed here), so all three must be unpacked.
name, surname, _ = get_name_surname_from_complete_name(my_name, language="en")
def comparison_score(self, profile, data_language="en", name_convention="father_surname"):
    '''
    Get the score and factor of the comparison of this profile with another.

    The result combines:
    * the comparison of the formal names, or of the best pair of
      alternative names when that one is clearly better (with a penalty),
    * the consistency of the events of each profile with the time span
      of the other one,
    * the gender (bonus when equal, reduced factor when both are known
      and different),
    * the dates of every event listed in MERGE_EVENTS.

    profile: the other profile to be compared with.
    data_language: language forwarded to the name helpers.
    name_convention: naming convention forwarded to the name helpers.

    Returns the tuple (score, factor).
    '''
    score, factor = get_score_compare_names(
        self.getName(), self.getSurname(), profile.getName(), profile.getSurname(),
        language=data_language, convention=name_convention)

    # We compare now all the potential names, in case that there is a
    # different convention.
    score_nicks = 0
    factor_nicks = 0
    for complete_name_self in self.get_all_names():
        name_self, surname_self, _ = get_name_surname_from_complete_name(
            complete_name_self, convention=name_convention, language=data_language)
        for complete_name_other in profile.get_all_names():
            name_other, surname_other, _ = get_name_surname_from_complete_name(
                complete_name_other, convention=name_convention, language=data_language)
            score_int, factor_int = get_score_compare_names(
                name_self, surname_self, name_other, surname_other,
                language=data_language, convention=name_convention)
            # We take the bigger of the scores.
            if (score_int > score_nicks) and (factor_int > factor_nicks):
                score_nicks = score_int
                factor_nicks = factor_int

    # We only take the new score if bigger than the previous one, but we
    # include a penalty (preference is given to the formal names).
    if (score_nicks > score) and (factor_nicks > factor) and (
            score_nicks * factor_nicks * 0.5 > score * factor):
        score = score_nicks * 0.5
        factor = factor_nicks

    # Comparing big differences in events: the events of each profile are
    # checked against the earliest/latest events of the other one.
    score1, factor1 = score_factor_birth_and_death(
        self.get_earliest_event_in_event_form(),
        self.get_latest_event_in_event_form(),
        profile.getEvents())
    # The latest event must come from the other profile as well; the
    # previous code mixed its earliest event with the latest event of self.
    score2, factor2 = score_factor_birth_and_death(
        profile.get_earliest_event_in_event_form(),
        profile.get_latest_event_in_event_form(),
        self.getEvents())

    # In this stage we add all obtained scores and factors.
    score += score1 + score2
    factor = factor * factor1 * factor2

    # Comparing gender.
    if (self.getGender()) and (profile.getGender()):
        if self.getGender() == profile.getGender():
            score += 0.5
        elif (self.getGender() != "U") and (profile.getGender() != "U"):
            factor = 0.1 * factor

    for event_name in MERGE_EVENTS:
        self_event = self.get_specific_event(event_name)
        other_event = profile.get_specific_event(event_name)
        if event_name == "marriage":
            # Marriages come as a collection; only the first one is used.
            self_event = self_event[0] if len(self_event) > 0 else None
            other_event = other_event[0] if len(other_event) > 0 else None
        if (self_event and other_event and self_event.is_any_date_available()
                and other_event.is_any_date_available()):
            score_temp, factor_temp = get_score_compare_dates(self_event, other_event)
            # A marriage never reduces the factor and residences are
            # ignored altogether.
            if (not ((self_event.get_event_type() == "marriage") and (factor_temp < 1.0))) and (
                    self_event.get_event_type() != "residence"):
                score += score_temp
                factor = factor * factor_temp
    return score, factor