示例#1
0
 def get_existing_before(self, profile_to_check, link_string, prof_id):
     '''
     Reports whether profile_to_check already holds a web reference named link_string.

     When such a reference is found, a MATCH_POTENTIAL_EXISTING task is attached to the
     profile of the input database identified by prof_id, a message is printed
     and True is returned. Otherwise nothing is recorded and False is returned.
     '''
     #The search stops at the first web reference carrying the requested name
     already_linked = any(
         web_entry["name"] == link_string
         for web_entry in profile_to_check.get_all_webs())
     if not already_linked:
         return False
     #The task is stored in the profile under study and points to the profile already linked
     task_owner = self.db_input.get_profile_by_ID(prof_id)
     task_details = " ".join([
         MATCH_POTENTIAL_INFO_EXISTING + profile_to_check.nameLifespan(),
         str(profile_to_check.get_id())
     ])
     include_task_no_duplicate(task_owner, MATCH_POTENTIAL_EXISTING, 1, task_details)
     print_out("-    AVOIDING INTRODUCTION OF EXISTING LINK IN " +
               str(profile_to_check.nameLifespan()))
     return True
 def _conflict_storing(self, profile_rm, conflicted_profiles_ids, db_conflict):
     '''
     Internal function to avoid duplicates. It registers a conflict where a profile
     got several candidate matches instead of a single one.

     profile_rm is the profile for which the conflict has been found
     conflicted_profiles_ids is the list of ids of the conflicting profiles
     db_conflict is the database used to read the names of the conflicting profiles

     The conflict is printed, stored in self.conflict_profiles under the id of
     profile_rm and reported as a MATCH_CONFLICT_TASK task in profile_rm.
     '''
     #Names of all candidates, each one followed by two blanks
     candidates_names = "".join(
         str(db_conflict.get_profile_by_ID(candidate_id).nameLifespan()) + "  "
         for candidate_id in conflicted_profiles_ids)
     print_out("-    CONFLICT of profile " + str(profile_rm.nameLifespan()) +
               " WITH PROFILE(S) " + candidates_names)
     #This is a conflict, we should have a single match!!!
     self.conflict_profiles[profile_rm.get_id()] = conflicted_profiles_ids
     #NOTE(review): the urls are read from self.database_geni while the names above come
     #from db_conflict - confirm both are expected to be the same database
     candidates_urls = "".join(
         self.database_geni.get_profile_by_ID(candidate_id).get_this_profile_url() + "     "
         for candidate_id in conflicted_profiles_ids)
     include_task_no_duplicate(profile_rm, MATCH_CONFLICT_TASK, 1,
                               MATCH_CONFLICT_INFO + candidates_urls)
示例#3
0
 def process(self,
             profiles_2_analyze="all",
             storage=False,
             threshold=360,
             avoid_import_living_from=("input", "check")):
     '''
     Reviews the profiles of the input database which are linked to the check database
     and copies the relatives missing in one database into the other one.

     For each linked profile the relatives are compared using match_single_profile.
     Non matched parents, partners and children are then added to the other
     database, unless they are living / not accessible or they look like a
     duplicate of an already existing profile, in which case a task is created
     for human review. The result is finally written to the research log.

     profiles_2_analyze = ids of the profiles of the input database which shall be analyzed, in
     case of no input ("all") it will analyze all linked profiles.
     storage = forwarded to get_research_log_id.
     threshold = forwarded to continue_match, which decides if a profile is analyzed or skipped.
     avoid_import_living_from = collection with "input" and/or "check", the living profiles of
     those databases will not be imported into the other one.
     '''
     kind_match = self.db_check.get_db_kind()
     match_str = str(kind_match) + MATCH
     matcher_profiles = match_single_profile(
         self.db_input,
         self.db_check,
         data_language=self.data_language,
         name_convention=self.name_convention)
     #Only the profiles holding a link to the check database can be processed
     linked_profiles = {
         prof.get_id(): prof
         for prof in self.db_input.get_all_profiles()
         if prof.get_specific_web(kind_match)
     }
     print_out(PROCESS_MATCH_NUMBER_OF_IMPACTS_BEGIN + str(kind_match) +
               PROCESS_MATCH_NUMBER_OF_IMPACTS_END +
               str(len(linked_profiles)))
     for prof_id, prof_in_study in linked_profiles.items():
         if (profiles_2_analyze != "all") and (prof_id not in profiles_2_analyze):
             continue
         if not continue_match(prof_in_study, match_str, threshold=threshold):
             print_out("SKIPPING " + prof_in_study.nameLifespan(),
                       log_level=15)
             continue
         prof_linked_id = prof_in_study.get_specific_web(kind_match)["url"]
         prof_linked = self.db_check.get_profile_by_ID(prof_linked_id)
         #In order to get the marriage, first we get the family from the other database
         _, family_profile = self.db_check.get_family_from_child(
             prof_linked.get_id())
         _, family_input = self.db_input.get_family_from_child(prof_id)
         #We will only analyze the profiles which have been introduced
         loc_research = get_research_log_id(prof_in_study, storage=storage)
         (non_matched_profiles_input, non_matched_profiles_check,
          conflict_profiles, matched_profiles) = matcher_profiles.match(prof_id)
         #We create the following slots that will be used for the introduction of confirmed candidates inside the database
         check_non_matches_existing_in_input = {}
         input_non_matches_existing_in_check = {}
         #One slot per database. "non_matches" holds the very same dictionaries returned by
         #the matcher, so any removal done through the slot is also visible in
         #non_matched_profiles_input / non_matched_profiles_check
         temp_data = {
             "input": {
                 "non_matches": non_matched_profiles_input,
                 "database": self.db_input,
                 "other_db": self.db_check,
                 "family_is_child": family_input,
                 "current_id_to_add": prof_linked_id,
                 "current_id_matched": prof_id,
                 "existing_prof": input_non_matches_existing_in_check
             },
             "check": {
                 "non_matches": non_matched_profiles_check,
                 "database": self.db_check,
                 "other_db": self.db_input,
                 "family_is_child": family_profile,
                 "current_id_to_add": prof_id,
                 "current_id_matched": prof_linked.get_id(),
                 "existing_prof": check_non_matches_existing_in_input
             }
         }
         #REMOVAL OF LIVING
         #If the option is selected to not publish data that is restricted like living we remove the matched characters
         for kind_db, slot in temp_data.items():
             if kind_db not in avoid_import_living_from:
                 continue
             #We iterate over a copy of the keys as entries are removed in the loop
             for prof in list(slot["non_matches"]):
                 potential_profile = slot["database"].get_profile_by_ID(prof)
                 if potential_profile.getLiving():
                     del slot["non_matches"][prof]
                     print_out("-    AVOIDING INTRODUCTION OF LIVING " +
                               str(potential_profile.nameLifespan()) +
                               " FROM THE DATABASE " +
                               slot["database"].get_db_kind())
         #Prior to starting the overall copy/match, we should check first if potential candidates exists in the input database
         for prof in list(non_matched_profiles_check):
             potential_profile = self.db_check.get_profile_by_ID(prof)
             #MATCHING: we look if the profile also exists before
             matches = self.db_input.get_potential_profile_match(
                 potential_profile,
                 data_language=self.data_language,
                 name_convention=self.name_convention)
             #Profiles might have been matched before
             existing_found = False
             duplicate_names = ""
             for candidate_prof, candidate_score in matches.items():
                 existing_prof = self.db_input.get_profile_by_ID(candidate_prof)
                 if (existing_prof.getName() != NOT_KNOWN_VALUE) and (
                         existing_prof.getSurname() != NOT_KNOWN_VALUE):
                     #We only continue looking for alternatives if there is no uncertain information in the profile
                     duplicate_names += existing_prof.nameLifespan() + " "
                     for web_data in existing_prof.get_all_webs():
                         #We now know that the profiles are the same in this database, we can proceed to add this existing one
                         if web_data["url"] == potential_profile.get_this_profile_url():
                             #We store the link of a profile id in check to a profile existing in input (profile class)
                             check_non_matches_existing_in_input[prof] = existing_prof
                             existing_found = True
                     #A score high enough is also considered as the same profile
                     if candidate_score["score*factor"] >= FACTOR_DUPLICATE:
                         check_non_matches_existing_in_input[prof] = existing_prof
                         existing_found = True
             if matches and not existing_found:
                 #In this case we have a certain potential match. We should not add and rather, leave for checking an human being
                 del non_matched_profiles_check[prof]
                 print_out("-    POTENTIAL DUPLICATE of profile " +
                           str(potential_profile.nameLifespan()) +
                           " WITH PROFILE(S) ID(S) " + str(duplicate_names))
                 #This is a conflict, we should avoid duplicating job of checking and solving
                 conflict_profiles[prof] = matches
                 details_info = ("Potential existing duplicates for profile " +
                                 potential_profile.nameLifespan() +
                                 " with web " +
                                 potential_profile.get_this_profile_url() +
                                 " in the profiles:      ")
                 for ids_check in matches:
                     temp_prof = self.db_input.get_profile_by_ID(ids_check)
                     details_info += (str(ids_check) + " : " +
                                      temp_prof.nameLifespan() + "        ")
                 include_task_no_duplicate(prof_in_study,
                                           MATCH_POTENTIAL_DUPLICATE, 1,
                                           details_info)
         ###################
         #MATCH INTRODUCTION
         ###################
         for db_kind, slot in temp_data.items():
             #CASE MATCH: Missing a parents or several parents.
             non_match_now = slot["non_matches"]
             db_now = slot["database"]
             db_other = slot["other_db"]
             family_is_child = slot["family_is_child"]
             current_id_to_add = slot["current_id_to_add"]
             #The registry of already existing profiles shall be the one of the database in
             #process (db_kind) and not the one left by the loop variable of a previous loop
             existing_now = slot["existing_prof"]
             #We shall first obtain the id of the non matched parents, if any
             father_id = next(
                 (pid for pid, relation in non_match_now.items()
                  if relation == "father"), None)
             mother_id = next(
                 (pid for pid, relation in non_match_now.items()
                  if relation == "mother"), None)
             if (father_id is None) and (mother_id is None):
                 continue
             father_profile = None
             mother_profile = None
             if father_id is not None:
                 father_profile = db_now.get_profile_by_ID(father_id)
             if mother_id is not None:
                 mother_profile = db_now.get_profile_by_ID(mother_id)
             Intro_sentence = MATCH_ADDING_PROFILES
             if (father_id in existing_now) or (mother_id in existing_now):
                 Intro_sentence = MATCH_EXISTING_PROFILES
             #We also add the link!
             if father_profile and mother_profile:
                 Intro_sentence += (father_profile.nameLifespan() + AND_STRING +
                                    mother_profile.nameLifespan() + TO_STRING +
                                    db_other.get_db_kind())
             elif father_profile:
                 Intro_sentence += (father_profile.nameLifespan() + TO_STRING +
                                    db_other.get_db_kind())
             elif mother_profile:
                 Intro_sentence += (mother_profile.nameLifespan() + TO_STRING +
                                    db_other.get_db_kind())
             #We inform of the inclusion of the new profiles
             print_out(Intro_sentence)
             marriage_event = family_is_child.getMarriage()
             #If the parent was existing before, we avoid double introduction
             father_to_add = existing_now.get(father_id, father_profile)
             mother_to_add = existing_now.get(mother_id, mother_profile)
             #Only the available parents are handed over to add_parents
             parents_to_add = {}
             if father_to_add:
                 parents_to_add["father_profile"] = father_to_add
             if mother_to_add:
                 parents_to_add["mother_profile"] = mother_to_add
             new_father_id = None
             new_mother_id = None
             if parents_to_add:
                 #So.. we add the new profiles
                 new_father_id, new_mother_id, _ = db_other.add_parents(
                     child_profile_id=current_id_to_add,
                     marriage_event=marriage_event,
                     **parents_to_add)
             if db_kind == "check":
                 if father_profile:
                     self.add_match_to_prof(new_father_id, father_profile)
                 if mother_profile:
                     self.add_match_to_prof(new_mother_id, mother_profile)
             #We remove the profiles, as will be added
             if father_id is not None:
                 del non_match_now[father_id]
             if mother_id is not None:
                 del non_match_now[mother_id]
         #PARTNERS: Review of partners for inclusion
         partners_input = self.db_input.get_partners_from_profile(prof_id)
         #Partners of input already matched, linked to their id in check
         matched_partners = {
             partner_input: matched_profiles[partner_input]
             for partner_input in partners_input
             if partner_input in matched_profiles
         }
         for kind_db, slot in temp_data.items():
             pending = slot["non_matches"]
             database = slot["database"]
             other_db = slot["other_db"]
             #We iterate over a copy of the keys as entries are removed in the loop
             for prof in list(pending):
                 #We might delete some profiles (children) in the middle, that's why we check first if the profile is in the list
                 if (prof not in pending) or (pending[prof] != "partner"):
                     continue
                 partner_profile = database.get_profile_by_ID(prof)
                 #Checking is already having a link to the database
                 existing_link = self.get_existing_before(
                     partner_profile, self.db_check.get_db_kind(), prof_id)
                 accessible = partner_profile.get_accessible()
                 #Now, if the partner is not accessible due to data restriction, we will skip this step
                 if accessible and not existing_link:
                     #Good, we will need now to add the new partner to the other database
                     Intro_sentence = PROCESS_ADD_PROFILE_BEGIN
                     if prof in slot["existing_prof"]:
                         Intro_sentence = PROCESS_LINK_PROFILE_BEGIN
                     print_out(Intro_sentence +
                               partner_profile.nameLifespan() + TO_STRING +
                               other_db.get_db_kind() +
                               PROCESS_ADD_PROFILE_END + pending[prof])
                     #get_family_from_partners provides the id of the family
                     family_check = database.get_family_from_partners(
                         slot["current_id_matched"],
                         partner_profile.get_id())
                     marriage_event = database.get_family_by_ID(
                         family_check).getMarriage()
                     #The profile might be existing before and we do not need to add it again, so we have a potential deviation
                     #If we store it before, in this case, we will use the existing profile, if not, we continue we the one in the other database
                     partner_to_introduce = slot["existing_prof"].get(
                         prof, partner_profile)
                     id_partner, _ = other_db.add_partner(
                         slot["current_id_to_add"],
                         partner_to_introduce,
                         marriage=marriage_event)
                     #matched_partners is always input id -> check id
                     if kind_db == "input":
                         self.add_match_to_prof(
                             prof,
                             other_db.get_profile_by_ID(id_partner),
                             adding=False)
                         matched_partners[prof] = id_partner
                     elif kind_db == "check":
                         self.add_match_to_prof(id_partner, partner_profile)
                         matched_partners[id_partner] = prof
                 elif not accessible:
                     print_out("-    AVOIDING INTRODUCTION OF LIVING " +
                               str(partner_profile.nameLifespan()) +
                               " FROM THE DATABASE " + database.get_db_kind())
                     family_eliminate = database.get_family_from_partners(
                         slot["current_id_matched"],
                         partner_profile.get_id())
                     #As the partner is not known and we might have access issues,
                     #we stop the review of those children in the non accesible partner.
                     #family_eliminate is a family id, so the children are requested to the database
                     for child in database.get_children_from_family(
                             family_eliminate):
                         pending.pop(child, None)
                 #We remove also from the non-matching pending those profiles that have been skipped due to privacy
                 if not existing_link:
                     del pending[partner_profile.get_id()]
         #We go ahead looking first for each matched partner
         for partner_input, partner_check in matched_partners.items():
             #We need the family which will be the input for the children
             family_part_input = self.db_input.get_family_from_partners(
                 prof_id, partner_input)
             family_part_check = self.db_check.get_family_from_partners(
                 prof_linked.get_id(), partner_check)
             #Family of the other database, where the children will be added
             family_part = {
                 "input": family_part_check,
                 "check": family_part_input
             }
             #Family of the database in process, where the children are read
             family_current = {
                 "input": family_part_input,
                 "check": family_part_check
             }
             #INTRODUCTION: CHILDREN in the family
             for kind_db, slot in temp_data.items():
                 pending = slot["non_matches"]
                 database = slot["database"]
                 other_db = slot["other_db"]
                 for prof in list(pending):
                     #We will only select those profiles which are children of this family
                     if pending[prof] != "child":
                         continue
                     if prof not in database.get_children_from_family(
                             family_current[kind_db]):
                         continue
                     child_profile = database.get_profile_by_ID(prof)
                     #Ok, if the profile is accessible, we go ahead for creation
                     if child_profile.get_accessible():
                         Intro_sentence = PROCESS_ADD_PROFILE_BEGIN
                         if prof in slot["existing_prof"]:
                             Intro_sentence = PROCESS_LINK_PROFILE_BEGIN
                         print_out(Intro_sentence +
                                   child_profile.nameLifespan() + TO_STRING +
                                   other_db.get_db_kind() +
                                   PROCESS_ADD_PROFILE_END + pending[prof])
                         #If the child was existing before, we add it directly with the right profile, avoiding double introduction
                         child_to_add = slot["existing_prof"].get(
                             prof, child_profile)
                         child_new_ids = other_db.add_child(
                             family_part[kind_db], [child_to_add])
                         if kind_db == "input":
                             child_new_prof = self.db_check.get_profile_by_ID(
                                 child_new_ids[0])
                             self.add_match_to_prof(prof,
                                                    child_new_prof,
                                                    adding=False)
                         elif kind_db == "check":
                             self.add_match_to_prof(child_new_ids[0],
                                                    child_profile)
                     else:
                         print_out(child_profile.nameLifespan() +
                                   PROCESS_NO_ACCESS)
                     #Processed or not accessible, the child is not pending any longer
                     del pending[prof]
         ################################################################
         if not (non_matched_profiles_input or non_matched_profiles_check
                 or conflict_profiles):
             #In this case, we have achieved the full matching, we store the information
             today = datetime.date.today().toordinal()
             notes_toadd = STATUS_MATCHED + str(today)
         else:
             notes_toadd = (STATUS_TO_CHECK + " " * 10 +
                            "--Missing match input " +
                            str(non_matched_profiles_input) + " " * 10 +
                            "--Missing match check " +
                            str(non_matched_profiles_check) + " " * 10 +
                            "--Pending conflicts " + str(conflict_profiles))
         record_research_log(prof_in_study, match_str, loc_research,
                             prof_linked_id, notes_toadd)
 def match(self, profile_ID):
     '''
     It executes the match of the direct relatives (parents, partners and children) of
     profile_ID between self.database and self.database_geni.

     The profile is expected to hold a web reference named as the kind of
     self.database_geni. That reference is updated when the linked profile
     reports a different address.

     Returns False, after logging MATCH_PROFILE_ERROR, when the profile has no such web reference.
     Otherwise it returns, in this order:
         - self.non_matched_profiles_rm: non matched profiles of self.database with their relation.
         - self.non_matched_profiles_geni: non matched profiles of self.database_geni with their relation.
         - self.conflict_profiles
         - self.matched_profiles
     '''
     #Initialization of the different matcher functions
     self._init_tracking_logs()
     profile_rm = self.database.get_profile_by_ID(profile_ID)
     print_out(str(profile_ID) + " = "  + profile_rm.nameLifespan())
     #We confirm is a valid profile, should contain a match. If several references
     #are available the last one is taken
     linked_urls = [
         web_ref["url"] for web_ref in profile_rm.get_all_webs()
         if web_ref["name"] == self.database_geni.get_db_kind()
     ]
     if not linked_urls:
         logging.error(MATCH_PROFILE_ERROR)
         return False
     url = linked_urls[-1]
     #This is the profile for analysis
     profile_geni = self.database_geni.get_profile_by_ID(url)
     #We might have an address that has been updated, we double check for updating it in DB
     refreshed_url = profile_geni.get_this_profile_url()
     if url != refreshed_url:
         profile_rm.update_web_ref(url = refreshed_url, name = self.database_geni.get_db_kind())
         print_out("UPDATING " + self.database_geni.get_db_kind() + " LINK to " + refreshed_url)
     #Starting checking of the parents: first the FATHER, then the MOTHER
     parent_lookups = (("get_father_from_child", FATHER),
                       ("get_mother_from_child", MOTHER))
     for getter_name, relation in parent_lookups:
         _, parent_rm = getattr(self.database, getter_name)(profile_rm.get_id())
         _, parent_geni = getattr(self.database_geni, getter_name)(profile_geni.get_id())
         if parent_rm and parent_geni:
             #Both databases know the parent, it is a potential match between the profiles
             self._match_single_pair(parent_rm, parent_geni)
         elif parent_rm:
             #The parent is only known in self.database
             self.non_matched_profiles_rm[parent_rm.get_id()] = relation
             print_out("-    NO MATCH of profile in " + self.database_geni.get_db_kind() + " " + str(parent_rm.nameLifespan()) + " Relation = " + relation)
         elif parent_geni:
             #The parent is only known in self.database_geni
             self.non_matched_profiles_geni[parent_geni.get_id()] = relation
             print_out("-    NO MATCH of profile in " + self.database.get_db_kind() + " " + str(parent_geni.nameLifespan()) + " Relation = " + relation)
         #Notice that we do not consider the case of no parent at all identified, no match needed.
     #PARTNERS
     self._track_2_lists(
         self.database.get_partners_from_profile(profile_rm.get_id()),
         self.database_geni.get_partners_from_profile(profile_geni.get_id()),
         PARTNER)
     #CHILDREN
     self._track_2_lists(
         self.database.get_all_children(profile_ID),
         self.database_geni.get_all_children(url),
         CHILD)
     return (self.non_matched_profiles_rm, self.non_matched_profiles_geni,
             self.conflict_profiles, self.matched_profiles)
 def _track_2_lists(self, profiles_rm, profiles_geni, kind_of_match):
     '''
     Function used for both partners and children as is a common function.
     It pairs the profiles of self.database with the profiles of
     self.database_geni that share the same relation with a matched profile.

     profiles_rm is an iterable of ids of profiles in self.database
     profiles_geni is an iterable of ids of profiles in self.database_geni
     kind_of_match is the string naming the relation, it is stored as value
     of the non matched dictionaries and added to the messages

     The function returns nothing, the outcome is stored as follows:
     - A single candidate above the threshold is sent to _match_single_pair
     - Several comparable candidates above the threshold are stored as a
       conflict using _conflict_storing
     - Candidates not above the threshold, but with a raw score of at least
       3 times the threshold, create a task for the user and are stored in
       self.conflict_profiles
     - Profiles without any candidate are stored in
       self.non_matched_profiles_rm or self.non_matched_profiles_geni
     '''
     #Candidates of the geni side that have not been assigned yet to any profile
     profiles_not_identified = list(profiles_geni)
     #Weak candidates per profile id, they are reviewed once all the matches are known
     conflict_potential_dictionary = {}
     for rm_id in profiles_rm:
         #We might be in a situation where very small similarities might create confusion of profiles,
         #we store the best score found so far
         previous_score = 0
         conflict_match = False
         profile_rm = self.database.get_profile_by_ID(rm_id)
         geni_matches = []
         #We iterate on a copy, as the list of pending candidates is modified inside the loop
         for geni_id in list(profiles_not_identified):
             profile_geni = self.database_geni.get_profile_by_ID(geni_id)
             score, factor = profile_rm.comparison_score(profile_geni, self.data_language, self.name_convention)
             weighted_score = score*factor
             if weighted_score > self.threshold:
                 #OPTIONS:
                 # 1.New profile is the right one
                 # 2.New profile is the first one
                 # 3.New profile is not the right one, but is the previous
                 # 4.New profile is as bad as the others.
                 #Option 1
                 if weighted_score > 3*previous_score:
                     #The candidates stored before are released, so other profiles can use them
                     profiles_not_identified += geni_matches
                     geni_matches = [geni_id]
                     previous_score = weighted_score
                     if geni_id in profiles_not_identified: profiles_not_identified.remove(geni_id)
                 #Option 2
                 elif len(geni_matches) == 0:
                     geni_matches.append(geni_id)
                     previous_score = weighted_score
                     if geni_id in profiles_not_identified: profiles_not_identified.remove(geni_id)
                 #Option 3
                 elif previous_score >= 3*weighted_score:
                     #In this case we ignore... we keep the previous one
                     pass
                 #Option 4
                 else:
                     #The candidate is comparable to the stored ones, so it is stored as well. Without this
                     #the candidate was removed from the pending ones without being reported anywhere and
                     #the conflict of several matches below could never be reached
                     geni_matches.append(geni_id)
                     if weighted_score > previous_score: previous_score = weighted_score
                     if geni_id in profiles_not_identified: profiles_not_identified.remove(geni_id)
             elif score >= 3*self.threshold:
                 conflict_match = True
                 #This is a common case, where profiles have a minimum difference but still relevant, user to check
                 conflict_potential_dictionary.setdefault(rm_id, []).append(geni_id)
         #If there is a single match, whatever other conditions, we introduce as a match
         if len(geni_matches) == 1:
             self._match_single_pair(profile_rm, self.database_geni.get_profile_by_ID(geni_matches[0]))
             #In case there has been found also a conflict, the conflict is no longer needed, as we do have a match.
             conflict_potential_dictionary.pop(rm_id, None)
         #Or we have more than one match... that is a conflict
         elif len(geni_matches) > 1:
             self._conflict_storing(profile_rm, geni_matches, self.database_geni)
             #The weak candidates are not needed either, otherwise the conflict just stored would be overwritten below
             conflict_potential_dictionary.pop(rm_id, None)
         #No match and no weak candidate, the profile is missing in the other database
         elif not conflict_match:
             self.non_matched_profiles_rm[rm_id] = kind_of_match
             print_out("-    NO MATCH of profile in " + self.database_geni.get_db_kind() + " " +
                       str(profile_rm.nameLifespan()) + " Relation = " + kind_of_match)
     #We perform another loop with those profiles which were conflicted as some of them might have been identified on the other side
     #NOTE(review): self.matched_profiles is presumably updated by _match_single_pair, to be confirmed
     already_matched = list(self.matched_profiles.values())
     for geni_candidates in conflict_potential_dictionary.values():
         #The list is rebuilt instead of removing items while iterating on it, as that skips the item
         #located after every removed one (the copy of a dictionary shares the lists with the original)
         geni_candidates[:] = [geni_id for geni_id in geni_candidates if geni_id not in already_matched]
     #The profiles left, are either a match, or we have found again gaps of profiles not found
     for rm_id_final_conflicted, geni_candidates in conflict_potential_dictionary.items():
         profile_rm_new = self.database.get_profile_by_ID(rm_id_final_conflicted)
         #If the list in the dictionary is empty, we do have a missing profile
         if not geni_candidates:
             #This profile was having a potential conflict that ended being actually an empty profile, it is a missing match
             self.non_matched_profiles_rm[rm_id_final_conflicted] = kind_of_match
             print_out("-    NO MATCH of profile in " + self.database_geni.get_db_kind() + " " +
                       str(profile_rm_new.nameLifespan()) + " Relation = " + kind_of_match)
         else:
             #Ok, in this case we have a potential match with an actual conflict
             details_info = "-    CONFLICT POTENTIAL MATCH " + str(profile_rm_new.nameLifespan()) + " with the following: "
             address_list = []
             for profile_id in geni_candidates:
                 #We remove conflicted profiles from the matching step
                 if profile_id in profiles_not_identified: profiles_not_identified.remove(profile_id)
                 profile = self.database_geni.get_profile_by_ID(profile_id)
                 address_list.append(profile.get_this_profile_url())
                 details_info += str(profile.nameLifespan()) + " Relation = " + kind_of_match
             include_task_no_duplicate(profile_rm_new, MATCH_CONFLICT_TASK, 1, details_info)
             print_out(details_info)
             self.conflict_profiles[rm_id_final_conflicted] = address_list
     #Now, we are able to detect those profiles on the "RIGHT" side. Not linked to other
     for missing_prof in profiles_not_identified:
         self.non_matched_profiles_geni[missing_prof] = kind_of_match
         prof = self.database_geni.get_profile_by_ID(missing_prof)
         print_out("-    NO MATCH of profile in " + self.database.get_db_kind() + " " + str(prof.nameLifespan()) + " Relation = " + kind_of_match)
 def _track_2_lists(self, profiles_rm, profiles_geni, kind_of_match):
     '''
     Function used for both partners and children as is a common function.
     It pairs the profiles of self.database with the profiles of
     self.database_geni that share the same relation with a matched profile.
     When a profile already has a link (url) to one of the candidates, that
     candidate is checked first and the complete comparison is avoided if
     the score confirms the link.

     profiles_rm is an iterable of ids of profiles in self.database
     profiles_geni is an iterable of ids of profiles in self.database_geni
     kind_of_match is the string naming the relation, it is stored as value
     of the non matched dictionaries and added to the messages

     The function returns nothing, the outcome is stored as follows:
     - A single candidate above the threshold is sent to _match_single_pair
     - Several comparable candidates above the threshold are stored as a
       conflict using _conflict_storing
     - Candidates not above the threshold, but with a raw score of at least
       3 times the threshold, create a task for the user and are stored in
       self.conflict_profiles
     - A profile with an existing url and no candidate at all creates a task
       and is stored in self.conflict_profiles with an empty list
     - Other profiles without any candidate are stored in
       self.non_matched_profiles_rm or self.non_matched_profiles_geni
     '''
     #Candidates of the geni side that have not been assigned yet to any profile
     profiles_not_identified = list(profiles_geni)
     #All the profiles are obtained in a single call, the result is accessed by profile id below
     total_prof = self.database_geni.get_several_profile_by_ID(profiles_not_identified)
     #We will create here the dictionary of the addresses, from the url of each profile to its id
     dict_address = {}
     for prof_key in total_prof:
         prof = total_prof[prof_key]
         dict_address[prof.get_this_profile_url()] = prof.get_id()
     #Weak candidates per profile id, they are reviewed once all the matches are known
     conflict_potential_dictionary = {}
     for rm_id in profiles_rm:
         #We might be in a situation where very small similarities might create confusion of profiles,
         #we store the best score found so far
         previous_score = 0
         conflict_match = False
         geni_matches = []
         profile_rm = self.database.get_profile_by_ID(rm_id)
         web_rm = profile_rm.get_specific_web(self.database_geni.get_db_kind())
         #NOTE(review): get_specific_web looks like it can return no data when there is no link (other
         #functions check its result before using it), in that case we consider there is no url
         url_rm_now = web_rm.get("url", None) if web_rm else None
         if url_rm_now in dict_address:
             #In this case we have a match, and potentially will be the same profile, we avoid several checks by using one
             linked_id = dict_address[url_rm_now]
             profile_geni = total_prof[linked_id]
             score, factor = profile_rm.comparison_score(profile_geni, self.data_language, self.name_convention)
             if score*factor > self.threshold:
                 #We confirm that this link is correct, so we avoid doing the complete loop...
                 geni_matches = [linked_id]
                 if linked_id in profiles_not_identified: profiles_not_identified.remove(linked_id)
         #We only continue if the check of the url was not successful
         if len(geni_matches) == 0:
             #We iterate on a copy, as the list of pending candidates is modified inside the loop
             for geni_id in list(profiles_not_identified):
                 #We have already obtained all profiles above
                 profile_geni = total_prof[geni_id]
                 score, factor = profile_rm.comparison_score(profile_geni, self.data_language, self.name_convention)
                 weighted_score = score*factor
                 if weighted_score > self.threshold:
                     #OPTIONS:
                     # 1.New profile is the right one
                     # 2.New profile is the first one
                     # 3.New profile is not the right one, but is the previous
                     # 4.New profile is as bad as the others.
                     #Option 1
                     if weighted_score > 2.5*previous_score:
                         #The candidates stored before are released, so other profiles can use them
                         profiles_not_identified += geni_matches
                         geni_matches = [geni_id]
                         previous_score = weighted_score
                         if geni_id in profiles_not_identified: profiles_not_identified.remove(geni_id)
                     #Option 2
                     elif len(geni_matches) == 0:
                         geni_matches.append(geni_id)
                         previous_score = weighted_score
                         if geni_id in profiles_not_identified: profiles_not_identified.remove(geni_id)
                     #Option 3
                     elif previous_score >= 2.5*weighted_score:
                         #In this case we ignore... we keep the previous one
                         pass
                     #Option 4
                     else:
                         #The candidate is comparable to the stored ones, so it is stored as well. Without this
                         #the candidate was removed from the pending ones without being reported anywhere and
                         #the conflict of several matches below could never be reached
                         geni_matches.append(geni_id)
                         if weighted_score > previous_score: previous_score = weighted_score
                         if geni_id in profiles_not_identified: profiles_not_identified.remove(geni_id)
                 elif score >= 3*self.threshold:
                     conflict_match = True
                     #This is a common case, where profiles have a minimum difference but still relevant, user to check
                     conflict_potential_dictionary.setdefault(rm_id, []).append(geni_id)
         #Options in place with the current match of url
         # No existing url => we ignore
         # Existing so...
         #  - The match is the same. => NO ACTION. Covered by code
         #  - The match is different => NO ACTION. Covered by code double match will be created with warning
         #  - There is no match => ACTION. Include a conflict warning
         #
         #If there is a single match, whatever other conditions, we introduce as a match
         if len(geni_matches) == 1:
             self._match_single_pair(profile_rm, self.database_geni.get_profile_by_ID(geni_matches[0]))
             #In case there has been found also a conflict, the conflict is no longer needed, as we do have a match.
             conflict_potential_dictionary.pop(rm_id, None)
         #Or we have more than one match... that is a conflict
         elif len(geni_matches) > 1:
             self._conflict_storing(profile_rm, geni_matches, self.database_geni)
             #The weak candidates are not needed either, otherwise the conflict just stored would be overwritten below
             conflict_potential_dictionary.pop(rm_id, None)
         #No match and no weak candidate
         elif not conflict_match:
             if url_rm_now:
                 #This is the only option where we are going to generate a conflict, as existing before!
                 print_out("-    CONFLICT of profile " + str(profile_rm.nameLifespan()) + MATCH_PREVIOUS_MATCH)
                 details = MATCH_CONFLICT_URL_MESSAGE + self.current_match + " as " + kind_of_match
                 include_task_no_duplicate(profile_rm, MATCH_CONFLICT_URL_EXISTING, 1, details)
                 #We move the profile to conflicted ones
                 self.conflict_profiles[profile_rm.get_id()] = []
             else:
                 self.non_matched_profiles_rm[rm_id] = kind_of_match
                 print_out("-    NO MATCH of profile in " + self.database_geni.get_db_kind() + " " +
                           str(profile_rm.nameLifespan()) + " Relation = " + kind_of_match)
     #We perform another loop with those profiles which were conflicted as some of them might have been identified on the other side
     #NOTE(review): self.matched_profiles is presumably updated by _match_single_pair, to be confirmed
     already_matched = list(self.matched_profiles.values())
     for geni_candidates in conflict_potential_dictionary.values():
         #The list is rebuilt instead of removing items while iterating on it, as that skips the item
         #located after every removed one (the copy of a dictionary shares the lists with the original)
         geni_candidates[:] = [geni_id for geni_id in geni_candidates if geni_id not in already_matched]
     #The profiles left, are either a match, or we have found again gaps of profiles not found
     for rm_id_final_conflicted, geni_candidates in conflict_potential_dictionary.items():
         profile_rm_new = self.database.get_profile_by_ID(rm_id_final_conflicted)
         #If the list in the dictionary is empty, we do have a missing profile
         if not geni_candidates:
             #This profile was having a potential conflict that ended being actually an empty profile, it is a missing match
             self.non_matched_profiles_rm[rm_id_final_conflicted] = kind_of_match
             print_out("-    NO MATCH of profile in " + self.database_geni.get_db_kind() + " " +
                       str(profile_rm_new.nameLifespan()) + " Relation = " + kind_of_match)
         else:
             #Ok, in this case we have a potential match with an actual conflict
             details_info = "-    CONFLICT POTENTIAL MATCH " + str(profile_rm_new.nameLifespan()) + " with the following: "
             address_list = []
             for profile_id in geni_candidates:
                 #We remove conflicted profiles from the matching step
                 if profile_id in profiles_not_identified: profiles_not_identified.remove(profile_id)
                 profile = self.database_geni.get_profile_by_ID(profile_id)
                 address_list.append(profile.get_this_profile_url())
                 details_info += str(profile.nameLifespan()) + " Relation = " + kind_of_match
             include_task_no_duplicate(profile_rm_new, MATCH_CONFLICT_TASK, 1, details_info)
             print_out(details_info)
             self.conflict_profiles[rm_id_final_conflicted] = address_list
     #Now, we are able to detect those profiles on the "RIGHT" side. Not linked to other
     for missing_prof in profiles_not_identified:
         self.non_matched_profiles_geni[missing_prof] = kind_of_match
         prof = self.database_geni.get_profile_by_ID(missing_prof)
         print_out("-    NO MATCH of profile in " + self.database.get_db_kind() + " " + str(prof.nameLifespan()) + " Relation = " + kind_of_match)
 def execute_sync(self,
                  profiles2analyze="all",
                  threshold=360,
                  storage=False):
     '''
     This is the core function, it will execute the global sync of profiles between
     primary (self.dbp) and secondary (self.dbs) database

     profiles2analyze is either "all", to review all the profiles of the primary
     database, or the ids handed over to get_several_profile_by_ID
     threshold is forwarded to continue_execution_step and is also the limit of
     days since the latest update of any of both profiles to review them again
     storage is forwarded to get_research_log_id

     Only the profiles having a web of the kind of the secondary database are
     reviewed. The events available in just one of both profiles are introduced
     in the other one with setNewEvent, with the exception of the events listed
     in ARRAY_EVENTS, which are only reported as not implemented. After the
     review, record_research_log is called on the primary profile.
     '''
     #NOTE(review): all profiles are requested even if only a subset is analyzed afterwards
     list_prof = self.dbp.get_all_profiles()
     if profiles2analyze != "all":
         list_prof = self.dbp.get_several_profile_by_ID(
             profiles2analyze).values()
     kind_match = self.dbs.get_db_kind()
     #Name used for the research item and the research log of this review
     match_str = str(kind_match) + MATCH
     for prof in list_prof:
         #We only review profiles linked to the secondary database
         if prof.get_specific_web(kind_match) and continue_execution_step(
                 prof, match_str, STATUS_SYNC, threshold=threshold):
             #Obtain the secondary database profile needed, the url of the web is used as id
             prof_second = self.dbs.get_profile_by_ID(
                 prof.get_specific_web(kind_match)["url"])
             #We obtain the time since the latest update of each profile
             update_primary = (datetime.now() - prof.get_update_datetime())
             update_secondary = (datetime.now() -
                                 prof_second.get_update_datetime())
             #Days since the most recent update of any of both profiles
             minimum_diff = min(update_primary.days, update_secondary.days)
             #Now, we only review in case the modification date is recent or there has not been any review before
             if (prof.get_research_item_by_name(match_str) is
                     None) or minimum_diff < threshold:
                 #We inform by the command line that we are analyzing one profile
                 print_out(str(prof.get_id()) + " : " + prof.nameLifespan())
                 #We check all the events of the profile in the dictionary
                 events_primary = prof.getEventsDict()
                 events_secondary = prof_second.getEventsDict()
                 #NOTE(review): events_in_both is not used in the part of the function visible here
                 events_in_both = set(events_primary) & set(
                     events_secondary)
                 events_only_in_primary = set(events_primary).difference(
                     set(events_secondary))
                 events_only_in_secondary = set(
                     events_secondary).difference(set(events_primary))
                 #Each entry describes one direction of the sync: the profile receiving the events,
                 #the ids of the events missing on it and the dictionary where those events are taken from
                 sync_data = {
                     "PRIM": {
                         "prof_destination": prof,
                         "events2introduce": events_only_in_secondary,
                         "events_dict": events_secondary,
                         "db_destiny": self.dbp
                     },
                     "SEC": {
                         "prof_destination": prof_second,
                         "events2introduce": events_only_in_primary,
                         "events_dict": events_primary,
                         "db_destiny": self.dbs
                     }
                 }
                 for id_sync in sync_data:
                     #As we know, we iterate on those events we know we need to introduce
                     for event_id in sync_data[id_sync]["events2introduce"]:
                         if event_id in ARRAY_EVENTS:
                             print_out("NOT IMPLEMENTED FOR " + event_id)
                         else:
                             #We obtain the new event class
                             print_out(SYNC_NEW_EVENT + event_id +
                                       SYNC_IN_DB + sync_data[id_sync]
                                       ["db_destiny"].get_db_kind())
                             event_new = sync_data[id_sync]["events_dict"][
                                 event_id]
                             sync_data[id_sync][
                                 "prof_destination"].setNewEvent(event_new)
                 #We store the exercise performed, the note includes the ordinal of the current date
                 loc_research = get_research_log_id(prof, storage=storage)
                 today = datetime.now().toordinal()
                 notes_toadd = STATUS_SYNC + str(today)
                 record_research_log(prof, match_str, loc_research, "",
                                     notes_toadd)