def _load_test_dataset(self):
    """Clear the registry and fill it with the fixture data for the tests.

    The data set contains one country, three organizations (two of them
    with domains), two unique identities with their profiles, one unique
    identity without identities or enrollments, four enrollments and
    two entries in the matching blacklist.
    """
    import datetime

    self.db.clear()

    # Country referenced by the profile of Jane Roe
    with self.db.connect() as session:
        session.add(Country(code='US',
                            name='United States of America',
                            alpha3='USA'))

    # Organizations, each one followed by its (domain, is_top_domain) pairs
    organizations = (
        ('Example', (('example.com', True),
                     ('example.net', True))),
        ('Bitergia', (('bitergia.net', True),
                      ('bitergia.com', True),
                      ('api.bitergia.com', False),
                      ('test.bitergia.com', False))),
        ('Unknown', ()),
    )
    for org_name, domains in organizations:
        api.add_organization(self.db, org_name)
        for domain, top in domains:
            api.add_domain(self.db, org_name, domain, is_top_domain=top)

    # John Smith: two identities and a profile flagged as bot
    smith = api.add_identity(self.db, 'scm', '*****@*****.**',
                             'John Smith', 'jsmith')
    api.add_identity(self.db, 'scm', '*****@*****.**',
                     'John Smith', uuid=smith)
    api.edit_profile(self.db, smith,
                     email='*****@*****.**', is_bot=True)

    # Jane Roe: three identities and a profile linked to the country
    roe = api.add_identity(self.db, 'scm', '*****@*****.**',
                           'Jane Roe', 'jroe')
    api.add_identity(self.db, 'scm', '*****@*****.**', uuid=roe)
    api.add_identity(self.db, 'unknown', '*****@*****.**', uuid=roe)
    api.edit_profile(self.db, roe,
                     name='Jane Roe', email='*****@*****.**',
                     is_bot=False, country_code='US')

    # Unique identity that has neither identities nor enrollments
    api.add_unique_identity(self.db,
                            '0000000000000000000000000000000000000000')

    # Enrollments as (uuid, organization, optional period)
    enrollments = (
        (smith, 'Example', ()),
        (roe, 'Example', ()),
        (roe, 'Bitergia', (datetime.datetime(1999, 1, 1),
                           datetime.datetime(2000, 1, 1))),
        (roe, 'Bitergia', (datetime.datetime(2006, 1, 1),
                           datetime.datetime(2008, 1, 1))),
    )
    for uuid, org_name, period in enrollments:
        api.add_enrollment(self.db, uuid, org_name, *period)

    # Matching blacklist
    for entry in ('*****@*****.**', 'John Smith'):
        api.add_to_matching_blacklist(self.db, entry)
def load_test_dataset(self):
    """Fill the registry with a country, two unique identities and
    three organizations, enrolling both identities in two of them.
    """
    smith = 'John Smith'
    doe = 'John Doe'

    # Country referenced by the profile of John Doe
    with self.db.connect() as session:
        session.add(Country(code='US',
                            name='United States of America',
                            alpha3='USA'))

    # John Smith
    api.add_unique_identity(self.db, smith)
    api.add_identity(self.db, 'scm', '*****@*****.**', uuid=smith)
    api.add_identity(self.db, 'scm', '*****@*****.**', 'John Smith',
                     uuid=smith)
    api.edit_profile(self.db, smith, name='John Smith', is_bot=False)

    # John Doe
    api.add_unique_identity(self.db, doe)
    api.add_identity(self.db, 'scm', '*****@*****.**', uuid=doe)
    api.edit_profile(self.db, doe, email='*****@*****.**',
                     is_bot=True, country_code='US')

    # Organizations and enrollments
    api.add_organization(self.db, 'Example')
    for uuid in (smith, doe):
        api.add_enrollment(self.db, uuid, 'Example')

    api.add_organization(self.db, 'Bitergia')
    api.add_enrollment(self.db, smith, 'Bitergia')
    api.add_enrollment(self.db, doe, 'Bitergia',
                       datetime.datetime(1999, 1, 1),
                       datetime.datetime(2000, 1, 1))

    # This organization has no enrollments
    api.add_organization(self.db, 'LibreSoft')
def test_valid_identities_already_exist(self):
    """Check method when an identity already exists but with distinct UUID"""

    # Store beforehand an identity under a UUID that is different
    # from the one it would get on import
    seeded = api.add_identity(self.db, 'unknown', email='*****@*****.**')
    api.add_identity(self.db, source='scm', email='*****@*****.**',
                     name='John Smith', username='******', uuid=seeded)
    api.edit_profile(self.db, seeded, name='John Smith', is_bot=False,
                     country_code='US')

    parser = self.get_parser('data/sortinghat_valid.json')
    self.assertEqual(self.cmd.import_identities(parser), CMD_SUCCESS)

    # Contents of the registry after the import
    uids = api.unique_identities(self.db)
    self.assertEqual(len(uids), 2)

    # John Smith
    jsmith = uids[0]
    self.assertEqual(jsmith.uuid, '23fe3a011190a27a7c5cf6f8925de38ff0994d8d')

    # The profile was already available, so it was not updated
    prf = jsmith.profile
    expected_profile = (
        ('uuid', '23fe3a011190a27a7c5cf6f8925de38ff0994d8d'),
        ('name', 'John Smith'),
        ('email', None),
        ('is_bot', False),
        ('country_code', 'US'),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)
    self.assertEqual(prf.country.code, 'US')
    self.assertEqual(prf.country.name, 'United States of America')

    ids = self.sort_identities(jsmith.identities)
    self.assertEqual(len(ids), 3)

    fields = ('id', 'name', 'email', 'username', 'source')
    expected_ids = (
        ('03e12d00e37fd45593c49a5a5a1652deca4cf302',
         'John Smith', '*****@*****.**', 'jsmith', 'scm'),
        ('23fe3a011190a27a7c5cf6f8925de38ff0994d8d',
         None, '*****@*****.**', None, 'unknown'),
        ('75d95d6c8492fd36d24a18bd45d62161e05fbc97',
         'John Smith', '*****@*****.**', None, 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)
def test_valid_identities_already_exist(self):
    """Check method when an identity already exists but with distinct UUID"""

    # Store beforehand an identity under a UUID that is different
    # from the one it would get on import
    seeded = api.add_identity(self.db, 'unknown', email='*****@*****.**')
    api.add_identity(self.db, source='scm', email='*****@*****.**',
                     name='John Smith', username='******', uuid=seeded)
    api.edit_profile(self.db, seeded, name='John Smith', is_bot=False,
                     country_code='US')

    parser = self.get_parser(datadir('sortinghat_valid.json'))
    self.assertEqual(self.cmd.import_identities(parser), CMD_SUCCESS)

    # Contents of the registry after the import
    uids = api.unique_identities(self.db)
    self.assertEqual(len(uids), 2)

    # John Smith
    jsmith = uids[1]
    self.assertEqual(jsmith.uuid, '2371a34a0ac65fbd9d631464ee41d583ec0e1e88')

    # A new profile was given, so the stored one is updated
    prf = jsmith.profile
    expected_profile = (
        ('uuid', '2371a34a0ac65fbd9d631464ee41d583ec0e1e88'),
        ('name', None),
        ('email', '*****@*****.**'),
        ('gender', 'male'),
        ('gender_acc', 100),
        ('is_bot', True),
        ('country', None),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)

    ids = self.sort_identities(jsmith.identities)
    self.assertEqual(len(ids), 3)

    fields = ('id', 'name', 'email', 'username', 'source')
    expected_ids = (
        ('2371a34a0ac65fbd9d631464ee41d583ec0e1e88',
         None, '*****@*****.**', None, 'unknown'),
        ('880b3dfcb3a08712e5831bddc3dfe81fc5d7b331',
         'John Smith', '*****@*****.**', None, 'scm'),
        ('a9b403e150dd4af8953a52a4bb841051e4b705d9',
         'John Smith', '*****@*****.**', 'jsmith', 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)
def test_valid_identities_already_exist(self):
    """Check method when an identity already exists but with distinct UUID"""

    # Store beforehand an identity under a UUID that is different
    # from the one it would get on import
    seeded = api.add_identity(self.db, 'unknown', email='*****@*****.**')
    api.add_identity(self.db, source='scm', email='*****@*****.**',
                     name='John Smith', username='******', uuid=seeded)
    api.edit_profile(self.db, seeded, name='John Smith', is_bot=False,
                     country_code='US')

    parser = self.get_parser('data/sortinghat_valid.json')
    self.assertEqual(self.cmd.import_identities(parser), CMD_SUCCESS)

    # Contents of the registry after the import
    uids = api.unique_identities(self.db)
    self.assertEqual(len(uids), 2)

    # John Smith
    jsmith = uids[1]
    self.assertEqual(jsmith.uuid, '2371a34a0ac65fbd9d631464ee41d583ec0e1e88')

    # A new profile was given, so the stored one is updated
    prf = jsmith.profile
    expected_profile = (
        ('uuid', '2371a34a0ac65fbd9d631464ee41d583ec0e1e88'),
        ('name', None),
        ('email', '*****@*****.**'),
        ('is_bot', True),
        ('country', None),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)

    ids = self.sort_identities(jsmith.identities)
    self.assertEqual(len(ids), 3)

    fields = ('id', 'name', 'email', 'username', 'source')
    expected_ids = (
        ('2371a34a0ac65fbd9d631464ee41d583ec0e1e88',
         None, '*****@*****.**', None, 'unknown'),
        ('880b3dfcb3a08712e5831bddc3dfe81fc5d7b331',
         'John Smith', '*****@*****.**', None, 'scm'),
        ('a9b403e150dd4af8953a52a4bb841051e4b705d9',
         'John Smith', '*****@*****.**', 'jsmith', 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)
def run(self):
    """Run the SortingHat tasks enabled on this instance.

    Each step is controlled by a flag of the instance:

    - ``unify``: unify identities once per algorithm listed in
      ``conf['sh_matching']``.
    - ``affiliate``: enroll identities using the affiliate command.
    - ``autoprofile``: complete profiles from the sources listed in
      ``conf['sh_autoprofile']``, when that key is configured.
    - ``bots``: set the bot flag on the profiles named in
      ``conf['sh_bots_names']`` and clear it on those named in
      ``conf['sh_no_bots_names']``.

    A command returning a code other than ``CMD_SUCCESS`` is logged as
    an error and the execution continues with the next step.
    """
    if self.unify:
        for algo in self.conf['sh_matching']:
            kwargs = {'matching': algo, 'fast_matching': True}
            logger.info(
                "[sortinghat] Unifying identities using algorithm %s",
                kwargs['matching'])
            code = Unify(**self.sh_kwargs).unify(**kwargs)
            if code != CMD_SUCCESS:
                logger.error("[sortinghat] Error in unify %s", kwargs)

    if self.affiliate:
        # Global enrollments using domains
        logger.info("[sortinghat] Executing affiliate")
        code = Affiliate(**self.sh_kwargs).affiliate()
        if code != CMD_SUCCESS:
            # 'kwargs' only exists when the unify loop has run, so the
            # return code is reported here instead.
            logger.error("[sortinghat] Error in affiliate %s", code)

    if self.autoprofile:
        if 'sh_autoprofile' not in self.conf:
            logger.info(
                "[sortinghat] Autoprofile not configured. Skipping.")
        else:
            logger.info("[sortinghat] Executing autoprofile: %s",
                        self.conf['sh_autoprofile'])
            sources = self.conf['sh_autoprofile']
            code = AutoProfile(**self.sh_kwargs).autocomplete(sources)
            if code != CMD_SUCCESS:
                # Report the arguments of this command, not the ones
                # left over from the unify step.
                logger.error("Error in autoprofile %s", sources)

    if self.bots:
        if 'sh_bots_names' not in self.conf:
            logger.info(
                "[sortinghat] Bots name list not configured. Skipping.")
        else:
            logger.info("[sortinghat] Marking bots: %s",
                        self.conf['sh_bots_names'])
            for name in self.conf['sh_bots_names']:
                # First we need the uuids for the profile name
                uuids = self.__get_uuids_from_profile_name(name)
                # Then we can modify the profile setting bot flag
                profile = {"is_bot": True}
                for uuid in uuids:
                    api.edit_profile(self.db, uuid, **profile)
            # For quitting the bot flag - debug feature
            if 'sh_no_bots_names' in self.conf:
                logger.info("[sortinghat] Removing Marking bots: %s",
                            self.conf['sh_no_bots_names'])
                for name in self.conf['sh_no_bots_names']:
                    uuids = self.__get_uuids_from_profile_name(name)
                    profile = {"is_bot": False}
                    for uuid in uuids:
                        api.edit_profile(self.db, uuid, **profile)
def add_identity(cls, db, identity, backend):
    """Add one identity, and its company enrollment, to Sorting Hat.

    :param db: database handle passed to the Sorting Hat API
    :param identity: mapping with the keys 'email', 'name' and
        'username'; an optional 'company' key names the organization
        the identity is enrolled in
    :param backend: source the identity is registered under

    :returns: the UUID of the identity, or None when the identity
        could not be added
    """
    uuid = None

    try:
        uuid = api.add_identity(db, backend, identity['email'],
                                identity['name'], identity['username'])

        logger.debug("New sortinghat identity %s %s,%s,%s ",
                     uuid, identity['username'], identity['name'],
                     identity['email'])

        # The profile name falls back to the username when no name is set
        profile = {
            "name": identity['name'] if identity['name'] else identity['username'],
            "email": identity['email']
        }

        api.edit_profile(db, uuid, **profile)
    except AlreadyExistsError as ex:
        uuid = ex.eid
    except InvalidValueError:
        logger.warning("Trying to add a None identity. Ignoring it.")
    except UnicodeEncodeError:
        logger.warning("UnicodeEncodeError. Ignoring it. %s %s %s",
                       identity['email'], identity['name'],
                       identity['username'])
    except Exception:
        logger.warning(
            "Unknown exception adding identity. Ignoring it. %s %s %s",
            identity['email'], identity['name'], identity['username'],
            exc_info=True)

    if uuid is None:
        # The identity was ignored, so there is nothing to enroll
        return uuid

    if 'company' in identity and identity['company'] is not None:
        # The organization is usually shared by several identities.
        # Finding it already registered must not skip the enrollment.
        try:
            api.add_organization(db, identity['company'])
        except AlreadyExistsError:
            pass

        try:
            api.add_enrollment(db, uuid, identity['company'],
                               datetime(1900, 1, 1),
                               datetime(2100, 1, 1))
        except AlreadyExistsError:
            pass

    return uuid
def add_identity(cls, db, identity, backend):
    """Add one identity, and its company enrollment, to Sorting Hat.

    :param db: database handle passed to the Sorting Hat API
    :param identity: mapping with the keys 'email', 'name' and
        'username'; an optional 'company' key names the organization
        the identity is enrolled in
    :param backend: source the identity is registered under

    :returns: the UUID of the identity, or None when the identity
        could not be added
    """
    uuid = None

    try:
        uuid = api.add_identity(db, backend, identity['email'],
                                identity['name'], identity['username'])

        logger.debug("New sortinghat identity %s %s,%s,%s ",
                     uuid, identity['username'], identity['name'],
                     identity['email'])

        # The profile name falls back to the username when no name is set
        profile = {"name": identity['name'] if identity['name'] else identity['username'],
                   "email": identity['email']}

        api.edit_profile(db, uuid, **profile)
    except AlreadyExistsError as ex:
        uuid = ex.uuid
    except WrappedValueError:
        logger.warning("Trying to add a None identity. Ignoring it.")
    except UnicodeEncodeError:
        logger.warning("UnicodeEncodeError. Ignoring it. %s %s %s",
                       identity['email'], identity['name'],
                       identity['username'])
    except Exception:
        logger.warning("Unknown exception adding identity. Ignoring it. %s %s %s",
                       identity['email'], identity['name'],
                       identity['username'])
        traceback.print_exc()

    if uuid is None:
        # The identity was ignored, so there is nothing to enroll
        return uuid

    if 'company' in identity and identity['company'] is not None:
        # The organization is usually shared by several identities.
        # Finding it already registered must not skip the enrollment.
        try:
            api.add_organization(db, identity['company'])
        except AlreadyExistsError:
            pass

        try:
            api.add_enrollment(db, uuid, identity['company'],
                               datetime(1900, 1, 1),
                               datetime(2100, 1, 1))
        except AlreadyExistsError:
            pass

    return uuid
def load_test_dataset(self):
    """Add four identities to the registry and set the profile of each."""
    # Each entry holds the positional arguments given to add_identity
    # after the database, plus the fields of the profile to edit
    dataset = (
        (('scm', '*****@*****.**', 'Jane', 'jroe'),
         {'name': "Jane Roe"}),
        (('scm', '*****@*****.**', 'Jane', 'jrae'),
         {'name': "Jane R", 'gender': "unknown"}),
        (('mls', '*****@*****.**', 'John Smith', 'jsmith'),
         {'name': "John Smith"}),
        (('scm', '*****@*****.**', 'John D', 'jdoe'),
         {'name': "John D"}),
    )

    for identity_args, profile in dataset:
        uuid = api.add_identity(self.db, *identity_args)
        api.edit_profile(self.db, uuid, **profile)
def test_valid_identities_with_default_matching(self):
    """Check insertion, matching and merging of valid data"""

    fields = ('id', 'name', 'email', 'username', 'source')
    period = (datetime.datetime(2000, 1, 1, 0, 0),
              datetime.datetime(2100, 1, 1, 0, 0))

    # Store beforehand the identity that will match
    # with one of those found in the file
    api.add_organization(self.db, 'Example')
    seeded = api.add_identity(self.db, 'unknown', email='*****@*****.**')
    api.add_enrollment(self.db, seeded, 'Example', *period)
    api.edit_profile(self.db, seeded, name='John Smith', is_bot=False,
                     country_code='US')

    parser = self.get_parser(datadir('sortinghat_valid.json'))
    code = self.cmd.import_identities(parser, matching='default')
    self.assertEqual(code, CMD_SUCCESS)

    # Contents of the registry after the import
    uids = api.unique_identities(self.db)
    self.assertEqual(len(uids), 2)

    # Jane Roe
    jroe = uids[0]
    self.assertEqual(jroe.uuid, '17ab00ed3825ec2f50483e33c88df223264182ba')

    prf = jroe.profile
    expected_profile = (
        ('uuid', '17ab00ed3825ec2f50483e33c88df223264182ba'),
        ('name', 'Jane Roe'),
        ('email', '*****@*****.**'),
        ('gender', None),
        ('gender_acc', None),
        ('is_bot', False),
        ('country_code', 'US'),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)
    self.assertEqual(prf.country.alpha3, 'USA')
    self.assertEqual(prf.country.code, 'US')
    self.assertEqual(prf.country.name, 'United States of America')

    ids = self.sort_identities(jroe.identities)
    self.assertEqual(len(ids), 3)

    expected_ids = (
        ('17ab00ed3825ec2f50483e33c88df223264182ba',
         'Jane Roe', '*****@*****.**', 'jroe', 'scm'),
        ('22d1b20763c6f5822bdda8508957486c547bb9de',
         None, '*****@*****.**', None, 'unknown'),
        ('322397ed782a798ffd9d0bc7e293df4292fe075d',
         None, '*****@*****.**', None, 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)

    self.assertEqual(len(api.enrollments(self.db, jroe.uuid)), 3)

    # John Smith
    jsmith = uids[1]
    self.assertEqual(jsmith.uuid, '2371a34a0ac65fbd9d631464ee41d583ec0e1e88')

    ids = self.sort_identities(jsmith.identities)
    self.assertEqual(len(ids), 3)

    # The stored profile and the imported one were merged
    prf = jsmith.profile
    expected_profile = (
        ('uuid', '2371a34a0ac65fbd9d631464ee41d583ec0e1e88'),
        ('name', 'John Smith'),
        ('email', '*****@*****.**'),
        ('gender', 'male'),
        ('gender_acc', 100),
        ('is_bot', True),
        ('country_code', 'US'),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)
    self.assertEqual(prf.country.code, 'US')
    self.assertEqual(prf.country.name, 'United States of America')

    expected_ids = (
        ('2371a34a0ac65fbd9d631464ee41d583ec0e1e88',
         None, '*****@*****.**', None, 'unknown'),
        ('880b3dfcb3a08712e5831bddc3dfe81fc5d7b331',
         'John Smith', '*****@*****.**', None, 'scm'),
        ('a9b403e150dd4af8953a52a4bb841051e4b705d9',
         'John Smith', '*****@*****.**', 'jsmith', 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)

    # The enrollments were merged into a single one
    enrollments = api.enrollments(self.db, jsmith.uuid)
    self.assertEqual(len(enrollments), 1)

    rol = enrollments[0]
    self.assertEqual(rol.organization.name, 'Example')
    self.assertEqual(rol.start, period[0])
    self.assertEqual(rol.end, period[1])
def add_identities(cls, db, identities, backend):
    """Add a list of identities to Sorting Hat.

    Every identity is registered under the `backend` source and its
    profile is set from its name (or username) and email. Identities
    that already exist or that cannot be added are skipped.

    :param db: database handle passed to the Sorting Hat API
    :param identities: list of mappings with the keys 'email', 'name'
        and 'username', and optionally 'company'
    :param backend: source the identities are registered under

    :returns: the UUIDs affected by merges; always an empty list while
        merging is disabled
    """
    # Merging while loading is disabled: too slow in large projects
    merge_identities = False

    logger.info("Adding the identities to SortingHat")
    if not merge_identities:
        logger.info("Not doing identities merge")

    total = 0
    lidentities = len(identities)

    if merge_identities:
        merged_identities = []  # old identities merged into new ones
        blacklist = api.blacklist(db)
        matching = 'email-name'  # Not active
        matcher = create_identity_matcher(matching, blacklist)

    for identity in identities:
        try:
            uuid = api.add_identity(db, backend, identity['email'],
                                    identity['name'], identity['username'])

            logger.debug("New sortinghat identity %s %s,%s,%s (%i/%i)",
                         uuid, identity['username'], identity['name'],
                         identity['email'], total, lidentities)

            # The profile name falls back to the username
            profile = {
                "name": identity['name'] if identity['name'] else identity['username'],
                "email": identity['email']
            }

            api.edit_profile(db, uuid, **profile)

            total += 1

            if not merge_identities:
                # NOTE(review): this also skips the company enrollment
                # below, which is therefore never reached while merging
                # is disabled - confirm that this is intended.
                continue

            # Time to merge
            matches = api.match_identities(db, uuid, matcher)

            if len(matches) > 1:
                u = api.unique_identities(db, uuid)[0]
                for m in matches:
                    # First add the old uuid to the list of uuids
                    # changed by the merge
                    if m.uuid not in merged_identities:
                        merged_identities.append(m.uuid)
                    if m.uuid == uuid:
                        continue
                    # Merge matched identity into added identity
                    api.merge_unique_identities(db, m.uuid, u.uuid)
        except AlreadyExistsError:
            continue
        except WrappedValueError:
            logger.warning("Trying to add a None identity. Ignoring it.")
            continue
        except UnicodeEncodeError:
            logger.warning("UnicodeEncodeError. Ignoring it. %s %s %s",
                           identity['email'], identity['name'],
                           identity['username'])
            continue
        except Exception:
            logger.warning("Unknown exception adding identity. Ignoring it. %s %s %s",
                           identity['email'], identity['name'],
                           identity['username'])
            traceback.print_exc()
            continue

        if 'company' in identity and identity['company'] is not None:
            # Finding the organization already registered must not
            # skip the enrollment of the identity.
            try:
                api.add_organization(db, identity['company'])
            except AlreadyExistsError:
                pass

            try:
                api.add_enrollment(db, uuid, identity['company'],
                                   datetime(1900, 1, 1),
                                   datetime(2100, 1, 1))
            except AlreadyExistsError:
                pass

    logger.info("Total NEW identities: %i", total)

    if merge_identities:
        logger.info("Total NEW identities merged: %i",
                    len(merged_identities))
        return merged_identities

    return []
def execute(self):
    """Run the identities tasks once no enrichment task is active.

    The method polls every second until the number of active
    enrichment tasks is zero and then raises the
    ``TasksManager.IDENTITIES_TASKS_ON`` flag. With the flag raised it
    runs, as configured in the 'sortinghat' section: unify (once per
    matching algorithm), affiliate, autoprofile, autogender, and the
    setting and clearing of the bot flag on profiles. The flag is
    lowered when the method ends, also when one of the steps raises
    an exception.
    """
    # ** START SYNC LOGIC **
    # Check that enrichment tasks are not active before loading identities
    while True:
        time.sleep(1)  # check each second if the task could start
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                logger.debug("[unify] Enrich tasks active: %i", enrich_tasks)
                if enrich_tasks == 0:
                    # The load of identities can be started
                    TasksManager.IDENTITIES_TASKS_ON = True
                    break
    # ** END SYNC LOGIC **

    try:
        cfg = self.config.get_conf()
        sh_cfg = cfg['sortinghat']

        for algo in sh_cfg['matching']:
            if not algo:
                # cfg['sortinghat']['matching'] is an empty list
                logger.debug('Unify not executed because empty algorithm')
                continue
            kwargs = {
                'matching': algo,
                'fast_matching': True,
                'strict_mapping': sh_cfg['strict_mapping']
            }
            logger.info("[sortinghat] Unifying identities using algorithm %s",
                        kwargs['matching'])
            self.do_unify(kwargs)

        if not sh_cfg['affiliate']:
            logger.debug("Not doing affiliation")
        else:
            # Global enrollments using domains
            logger.info("[sortinghat] Executing affiliate")
            self.do_affiliate()

        # An empty list of sources counts as not configured; it is
        # checked before reading its first element.
        if 'autoprofile' not in sh_cfg or \
                not sh_cfg['autoprofile'] or \
                not sh_cfg['autoprofile'][0]:
            logger.info("[sortinghat] Autoprofile not configured. Skipping.")
        else:
            logger.info("[sortinghat] Executing autoprofile for sources: %s",
                        sh_cfg['autoprofile'])
            sources = sh_cfg['autoprofile']
            self.do_autoprofile(sources)

        if 'autogender' not in sh_cfg or not sh_cfg['autogender']:
            logger.info("[sortinghat] Autogender not configured. Skipping.")
        else:
            logger.info("[sortinghat] Executing autogender")
            self.do_autogender()

        if 'bots_names' not in sh_cfg:
            logger.info(
                "[sortinghat] Bots name list not configured. Skipping.")
        else:
            logger.info("[sortinghat] Marking bots: %s",
                        sh_cfg['bots_names'])
            for name in sh_cfg['bots_names']:
                # First we need the uuids for the profile name
                uuids = self.__get_uuids_from_profile_name(name)
                # Then we can modify the profile setting bot flag
                profile = {"is_bot": True}
                for uuid in uuids:
                    api.edit_profile(self.db, uuid, **profile)
            # For quitting the bot flag - debug feature
            if 'no_bots_names' in sh_cfg:
                logger.info("[sortinghat] Removing Marking bots: %s",
                            sh_cfg['no_bots_names'])
                for name in sh_cfg['no_bots_names']:
                    uuids = self.__get_uuids_from_profile_name(name)
                    profile = {"is_bot": False}
                    for uuid in uuids:
                        api.edit_profile(self.db, uuid, **profile)
    finally:
        # Always lower the flag: the sync loop above treats it as the
        # sign that the identities tasks are still running.
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            TasksManager.IDENTITIES_TASKS_ON = False
def test_valid_identities_with_default_matching(self):
    """Check insertion, matching and merging of valid data"""

    fields = ('id', 'name', 'email', 'username', 'source')
    period = (datetime.datetime(2000, 1, 1, 0, 0),
              datetime.datetime(2100, 1, 1, 0, 0))

    # Store beforehand the identity that will match
    # with one of those found in the file
    api.add_organization(self.db, 'Example')
    seeded = api.add_identity(self.db, 'unknown', email='*****@*****.**')
    api.add_enrollment(self.db, seeded, 'Example', *period)
    api.edit_profile(self.db, seeded, name='John Smith', is_bot=False,
                     country_code='US')

    parser = self.get_parser('data/sortinghat_valid.json')
    code = self.cmd.import_identities(parser, matching='default')
    self.assertEqual(code, CMD_SUCCESS)

    # Contents of the registry after the import
    uids = api.unique_identities(self.db)
    self.assertEqual(len(uids), 2)

    # John Smith
    jsmith = uids[0]
    self.assertEqual(jsmith.uuid, '23fe3a011190a27a7c5cf6f8925de38ff0994d8d')

    ids = self.sort_identities(jsmith.identities)
    self.assertEqual(len(ids), 3)

    # The stored profile and the imported one were merged
    prf = jsmith.profile
    expected_profile = (
        ('uuid', '23fe3a011190a27a7c5cf6f8925de38ff0994d8d'),
        ('name', 'John Smith'),
        ('email', '*****@*****.**'),
        ('is_bot', True),
        ('country_code', 'US'),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)
    self.assertEqual(prf.country.code, 'US')
    self.assertEqual(prf.country.name, 'United States of America')

    expected_ids = (
        ('03e12d00e37fd45593c49a5a5a1652deca4cf302',
         'John Smith', '*****@*****.**', 'jsmith', 'scm'),
        ('23fe3a011190a27a7c5cf6f8925de38ff0994d8d',
         None, '*****@*****.**', None, 'unknown'),
        ('75d95d6c8492fd36d24a18bd45d62161e05fbc97',
         'John Smith', '*****@*****.**', None, 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)

    # The enrollments were merged into a single one
    enrollments = api.enrollments(self.db, jsmith.uuid)
    self.assertEqual(len(enrollments), 1)

    rol = enrollments[0]
    self.assertEqual(rol.organization.name, 'Example')
    self.assertEqual(rol.start, period[0])
    self.assertEqual(rol.end, period[1])

    # Jane Roe
    jroe = uids[1]
    self.assertEqual(jroe.uuid, '52e0aa0a14826627e633fd15332988686b730ab3')

    prf = jroe.profile
    expected_profile = (
        ('uuid', '52e0aa0a14826627e633fd15332988686b730ab3'),
        ('name', 'Jane Roe'),
        ('email', '*****@*****.**'),
        ('is_bot', False),
        ('country_code', 'US'),
    )
    for attr, value in expected_profile:
        self.assertEqual(getattr(prf, attr), value)
    self.assertEqual(prf.country.alpha3, 'USA')
    self.assertEqual(prf.country.code, 'US')
    self.assertEqual(prf.country.name, 'United States of America')

    ids = self.sort_identities(jroe.identities)
    self.assertEqual(len(ids), 3)

    expected_ids = (
        ('52e0aa0a14826627e633fd15332988686b730ab3',
         'Jane Roe', '*****@*****.**', 'jroe', 'scm'),
        ('cbfb7bd31d556322c640f5bc7b31d58a12b15904',
         None, '*****@*****.**', None, 'unknown'),
        ('fef873c50a48cfc057f7aa19f423f81889a8907f',
         None, '*****@*****.**', None, 'scm'),
    )
    for identity, values in zip(ids, expected_ids):
        for field, value in zip(fields, values):
            self.assertEqual(getattr(identity, field), value)

    self.assertEqual(len(api.enrollments(self.db, jroe.uuid)), 3)
def add_identities(cls, db, identities, backend):
    """Add a list of identities to Sorting Hat.

    Every identity is registered under the `backend` source and its
    profile is set from its name (or username) and email. Identities
    that already exist or that cannot be added are skipped.

    :param db: database handle passed to the Sorting Hat API
    :param identities: list of mappings with the keys 'email', 'name'
        and 'username', and optionally 'company'
    :param backend: source the identities are registered under

    :returns: the UUIDs affected by merges; always an empty list while
        merging is disabled
    """
    # Merging while loading is disabled: too slow in large projects
    merge_identities = False

    logger.info("Adding the identities to SortingHat")
    if not merge_identities:
        logger.info("Not doing identities merge")

    total = 0
    lidentities = len(identities)

    if merge_identities:
        merged_identities = []  # old identities merged into new ones
        blacklist = api.blacklist(db)
        matching = 'email-name'  # Not active
        matcher = create_identity_matcher(matching, blacklist)

    for identity in identities:
        try:
            uuid = api.add_identity(db, backend, identity['email'],
                                    identity['name'], identity['username'])

            logger.debug("New sortinghat identity %s %s,%s,%s (%i/%i)",
                         uuid, identity['username'], identity['name'],
                         identity['email'], total, lidentities)

            # The profile name falls back to the username
            profile = {"name": identity['name'] if identity['name'] else identity['username'],
                       "email": identity['email']}

            api.edit_profile(db, uuid, **profile)

            total += 1

            if not merge_identities:
                # NOTE(review): this also skips the company enrollment
                # below, which is therefore never reached while merging
                # is disabled - confirm that this is intended.
                continue

            # Time to merge
            matches = api.match_identities(db, uuid, matcher)

            if len(matches) > 1:
                u = api.unique_identities(db, uuid)[0]
                for m in matches:
                    # First add the old uuid to the list of uuids
                    # changed by the merge
                    if m.uuid not in merged_identities:
                        merged_identities.append(m.uuid)
                    if m.uuid == uuid:
                        continue
                    # Merge matched identity into added identity
                    api.merge_unique_identities(db, m.uuid, u.uuid)
        except AlreadyExistsError:
            continue
        except WrappedValueError:
            logger.warning("Trying to add a None identity. Ignoring it.")
            continue
        except UnicodeEncodeError:
            logger.warning("UnicodeEncodeError. Ignoring it. %s %s %s",
                           identity['email'], identity['name'],
                           identity['username'])
            continue
        except Exception:
            logger.warning("Unknown exception adding identity. Ignoring it. %s %s %s",
                           identity['email'], identity['name'],
                           identity['username'])
            traceback.print_exc()
            continue

        if 'company' in identity and identity['company'] is not None:
            # Finding the organization already registered must not
            # skip the enrollment of the identity.
            try:
                api.add_organization(db, identity['company'])
            except AlreadyExistsError:
                pass

            try:
                api.add_enrollment(db, uuid, identity['company'],
                                   datetime(1900, 1, 1),
                                   datetime(2100, 1, 1))
            except AlreadyExistsError:
                pass

    logger.info("Total NEW identities: %i", total)

    if merge_identities:
        logger.info("Total NEW identities merged: %i",
                    len(merged_identities))
        return merged_identities

    return []
def execute(self):
    """Run the identities tasks once no enrichment task is active.

    The method polls every second until the number of active
    enrichment tasks is zero and then raises the
    ``TasksManager.IDENTITIES_TASKS_ON`` flag. With the flag raised it
    runs, as configured in the 'sortinghat' section: unify (once per
    matching algorithm), affiliate, autoprofile, autogender, and the
    setting and clearing of the bot flag on profiles. The flag is
    lowered when the method ends, also when one of the steps raises
    an exception.
    """
    # ** START SYNC LOGIC **
    # Check that enrichment tasks are not active before loading identities
    while True:
        time.sleep(1)  # check each second if the task could start
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                logger.debug("[unify] Enrich tasks active: %i", enrich_tasks)
                if enrich_tasks == 0:
                    # The load of identities can be started
                    TasksManager.IDENTITIES_TASKS_ON = True
                    break
    # ** END SYNC LOGIC **

    try:
        cfg = self.config.get_conf()
        sh_cfg = cfg['sortinghat']

        for algo in sh_cfg['matching']:
            if not algo:
                # cfg['sortinghat']['matching'] is an empty list
                logger.debug('Unify not executed because empty algorithm')
                continue
            kwargs = {'matching': algo, 'fast_matching': True,
                      'strict_mapping': sh_cfg['strict_mapping']}
            logger.info("[sortinghat] Unifying identities using algorithm %s",
                        kwargs['matching'])
            self.do_unify(kwargs)

        if not sh_cfg['affiliate']:
            logger.debug("Not doing affiliation")
        else:
            # Global enrollments using domains
            logger.info("[sortinghat] Executing affiliate")
            self.do_affiliate()

        # An empty list of sources counts as not configured; it is
        # checked before reading its first element.
        if 'autoprofile' not in sh_cfg or \
                not sh_cfg['autoprofile'] or \
                not sh_cfg['autoprofile'][0]:
            logger.info("[sortinghat] Autoprofile not configured. Skipping.")
        else:
            logger.info("[sortinghat] Executing autoprofile for sources: %s",
                        sh_cfg['autoprofile'])
            sources = sh_cfg['autoprofile']
            self.do_autoprofile(sources)

        if 'autogender' not in sh_cfg or not sh_cfg['autogender']:
            logger.info("[sortinghat] Autogender not configured. Skipping.")
        else:
            logger.info("[sortinghat] Executing autogender")
            self.do_autogender()

        if 'bots_names' not in sh_cfg:
            logger.info("[sortinghat] Bots name list not configured. Skipping.")
        else:
            logger.info("[sortinghat] Marking bots: %s",
                        sh_cfg['bots_names'])
            for name in sh_cfg['bots_names']:
                # First we need the uuids for the profile name
                uuids = self.__get_uuids_from_profile_name(name)
                # Then we can modify the profile setting bot flag
                profile = {"is_bot": True}
                for uuid in uuids:
                    api.edit_profile(self.db, uuid, **profile)
            # For quitting the bot flag - debug feature
            if 'no_bots_names' in sh_cfg:
                logger.info("[sortinghat] Removing Marking bots: %s",
                            sh_cfg['no_bots_names'])
                for name in sh_cfg['no_bots_names']:
                    uuids = self.__get_uuids_from_profile_name(name)
                    profile = {"is_bot": False}
                    for uuid in uuids:
                        api.edit_profile(self.db, uuid, **profile)
    finally:
        # Always lower the flag: the sync loop above treats it as the
        # sign that the identities tasks are still running.
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            TasksManager.IDENTITIES_TASKS_ON = False
def test_retry(self):
    """Test if a name is skipped when a connection error is returned"""

    http_requests = setup_genderize_server()

    # Connection errors (a 502 HTTP error in this case) will prevent
    # this profile from being updated
    error_uuid = api.add_identity(self.db, 'scm', '*****@*****.**',
                                  'Error Name')
    api.edit_profile(self.db, error_uuid, name="Error Name")

    # Tests
    self.cmd.autogender(api_token='abcdefghi')

    uids = api.unique_identities(self.db)

    # Expected (uuid, gender, accuracy) of each profile, in order
    expected_profiles = (
        ('2a9ec221b8dd5d5a85ae0e3276b8b2c3618ee15e', 'female', 100),
        # 'Error Name' was not updated due to connection errors
        ('316b78ff088c2a825defacb802013fa670fccb48', None, None),
        # The gender of Jane Rae was already set, so it is kept
        ('3e1eccdb1e52ea56225f419d3e532fe9133c7821', 'unknown', 100),
        ('539acca35c2e8502951a97d2d5af8b0857440b50', 'male', 99),
        ('a39ac334be9f17bfc7f9f21bbb25f389388f8e18', 'male', 99),
    )
    for position, (uuid, gender, accuracy) in enumerate(expected_profiles):
        prf = uids[position].profile
        self.assertEqual(prf.uuid, uuid)
        self.assertEqual(prf.gender, gender)
        self.assertEqual(prf.gender_acc, accuracy)

    jane_query = {'name': ['jane'], 'apikey': ['abcdefghi']}
    error_query = {'name': ['error'], 'apikey': ['abcdefghi']}
    john_query = {'name': ['john'], 'apikey': ['abcdefghi']}

    self.assertEqual(len(http_requests), 8)

    # One request for 'jane', six for 'error' and a last one for 'john'
    expected_queries = [jane_query] + [error_query] * 6 + [john_query]
    for position, query in enumerate(expected_queries):
        req = http_requests[position]
        self.assertEqual(req.method, 'GET')
        self.assertEqual(req.querystring, query)
def test_autogender_ignore_name_not_well_formed(self):
    """Test if no gender is set when a name is invalid.

    Two profiles with names that are not well formed are added. After
    running autogender their gender fields must remain unset and no
    request must have been sent for them.
    """
    http_requests = setup_genderize_server()

    # These names are invalid so they will be ignored
    first_uuid = api.add_identity(self.db, 'scm', '*****@*****.**', 'Random Name')
    api.edit_profile(self.db, first_uuid, name="r4nd0m")

    second_uuid = api.add_identity(self.db, 'mls', '*****@*****.**', 'Another Random Name')
    api.edit_profile(self.db, second_uuid, name="ARadomName")

    self.cmd.autogender(api_token='abcdefghi')

    # (uuid, gender, gender_acc) expected at each position of the listing
    expected_profiles = [
        ('2a9ec221b8dd5d5a85ae0e3276b8b2c3618ee15e', 'female', 100),
        # Jane Rae gender is not updated because it was already set
        ('3e1eccdb1e52ea56225f419d3e532fe9133c7821', 'unknown', 100),
        ('539acca35c2e8502951a97d2d5af8b0857440b50', 'male', 99),
        ('a39ac334be9f17bfc7f9f21bbb25f389388f8e18', 'male', 99),
        # These names were ignored and their profile were not set either
        ('cfa19ae04ce0c70902a31084fc75086b61ccfcf2', None, None),
        ('ee48da0af80479e81b846bec13fe238c06772701', None, None),
    ]

    uids = api.unique_identities(self.db)

    for position, (profile_uuid, gender, accuracy) in enumerate(expected_profiles):
        prf = uids[position].profile
        self.assertEqual(prf.uuid, profile_uuid)
        self.assertEqual(prf.gender, gender)
        self.assertEqual(prf.gender_acc, accuracy)

    # Check requests.
    # Only two valid names were checked
    expected = [
        {
            'name': ['jane'],
            'apikey': ['abcdefghi']
        },
        {
            'name': ['john'],
            'apikey': ['abcdefghi']
        },
    ]

    self.assertEqual(len(http_requests), len(expected))

    for position, query in enumerate(expected):
        self.assertDictEqual(http_requests[position].querystring, query)
def test_autogender_name_not_found(self):
    """Test if no gender is set when a name is not found.

    A profile named 'Random Name' is added. A request is sent for it,
    but its gender fields must remain unset after running autogender.
    """
    http_requests = setup_genderize_server()

    # This name won't be found
    random_uuid = api.add_identity(self.db, 'scm', '*****@*****.**', 'Random Name')
    api.edit_profile(self.db, random_uuid, name="Random Name")

    self.cmd.autogender(api_token='abcdefghi')

    # (uuid, gender, gender_acc) expected at each position of the listing
    expected_profiles = [
        ('2a9ec221b8dd5d5a85ae0e3276b8b2c3618ee15e', 'female', 100),
        # Jane Rae gender is not updated because it was already set
        ('3e1eccdb1e52ea56225f419d3e532fe9133c7821', 'unknown', 100),
        ('539acca35c2e8502951a97d2d5af8b0857440b50', 'male', 99),
        ('a39ac334be9f17bfc7f9f21bbb25f389388f8e18', 'male', 99),
        # This name was not found and the gender was not updated
        ('cfa19ae04ce0c70902a31084fc75086b61ccfcf2', None, None),
    ]

    uids = api.unique_identities(self.db)

    for position, (profile_uuid, gender, accuracy) in enumerate(expected_profiles):
        prf = uids[position].profile
        self.assertEqual(prf.uuid, profile_uuid)
        self.assertEqual(prf.gender, gender)
        self.assertEqual(prf.gender_acc, accuracy)

    # Check requests
    expected = [
        {
            'name': ['jane'],
            'apikey': ['abcdefghi']
        },
        {
            'name': ['john'],
            'apikey': ['abcdefghi']
        },
        {
            'name': ['random'],
            'apikey': ['abcdefghi']
        },
    ]

    self.assertEqual(len(http_requests), len(expected))

    for position, query in enumerate(expected):
        self.assertDictEqual(http_requests[position].querystring, query)
def execute(self):
    """Run the identities steps enabled on this task.

    The method first waits until no enrichment task is active, polling
    ``TasksManager.NUMBER_ENRICH_TASKS_ON`` every 10 seconds. It then sets
    ``TasksManager.IDENTITIES_TASKS_ON`` to True and runs these steps:

    * unify (``self.unify``): calls ``self.do_unify`` once per non-empty
      algorithm listed in ``cfg['sortinghat']['matching']``.
    * affiliate (``self.affiliate``): calls ``self.do_affiliate`` when
      ``cfg['sortinghat']['affiliate']`` is set.
    * autoprofile (``self.autoprofile``): calls ``self.do_autoprofile``
      with the configured sources, if any.
    * The uuids returned by unify and affiliate are appended to every
      entry of the mapping taken from ``TasksManager.UPDATED_UUIDS_QUEUE``.
    * bots (``self.bots``): sets or clears the ``is_bot`` profile flag for
      the names in ``bots_names`` / ``no_bots_names``.
    * Every entry of the mapping taken from
      ``TasksManager.AUTOREFRESH_QUEUE`` is set to True.

    ``TasksManager.IDENTITIES_TASKS_ON`` is set back to False when the
    method exits, also when one of the steps raises an exception; the
    exception itself is propagated to the caller unchanged.
    """
    # ** START SYNC LOGIC **
    # Check that enrichment tasks are not active before loading identities
    while True:
        time.sleep(10)  # check each 10 seconds if the identities load could start
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                logger.debug("Enrich tasks active: %i", enrich_tasks)
                if enrich_tasks == 0:
                    # The load of identities can be started
                    TasksManager.IDENTITIES_TASKS_ON = True
                    break
    # ** END SYNC LOGIC **

    try:
        cfg = self.config.get_conf()

        uuids_refresh = []

        if self.unify:
            for algo in cfg['sortinghat']['matching']:
                if not algo:
                    # cfg['sortinghat']['matching'] is an empty list
                    logger.debug('Unify not executed because empty algorithm')
                    continue
                kwargs = {
                    'matching': algo,
                    'fast_matching': True,
                    'strict_mapping': cfg['sortinghat']['strict_mapping']
                }
                logger.info("[sortinghat] Unifying identities using algorithm %s",
                            kwargs['matching'])
                uuids = self.do_unify(kwargs)
                uuids_refresh += uuids
                logger.debug("uuids to refresh from unify: %s", uuids)

        if self.affiliate:
            if not cfg['sortinghat']['affiliate']:
                logger.debug("Not doing affiliation")
            else:
                # Global enrollments using domains
                logger.info("[sortinghat] Executing affiliate")
                uuids = self.do_affiliate()
                uuids_refresh += uuids
                logger.debug("uuids to refresh from affiliate: %s", uuids)

        if self.autoprofile:
            sortinghat_cfg = cfg['sortinghat']
            sources = None
            if 'autoprofile' in sortinghat_cfg:
                sources = sortinghat_cfg['autoprofile']
            # An unset option may show up as [''] (per the original note:
            # autoprofile = [] -> cfg['sortinghat']['autoprofile'][0] = ['']).
            # A really empty list is treated as not configured too, instead
            # of failing with IndexError on sources[0].
            if not sources or not sources[0]:
                logger.info("[sortinghat] Autoprofile not configured. Skipping.")
            else:
                logger.info("[sortinghat] Executing autoprofile for sources: %s",
                            sources)
                self.do_autoprofile(sources)

        # The uuids must be refreshed in all backends (data sources)
        # Give 5s so the queue is filled and if not, continue without it
        try:
            autorefresh_backends_uuids = TasksManager.UPDATED_UUIDS_QUEUE.get(timeout=5)
            for backend_section in autorefresh_backends_uuids:
                autorefresh_backends_uuids[backend_section] += uuids_refresh
            TasksManager.UPDATED_UUIDS_QUEUE.put(autorefresh_backends_uuids)
            logger.debug("Autorefresh uuids queue after processing identities: %s",
                         autorefresh_backends_uuids)
        except Empty:
            logger.warning("Autorefresh uuids not active because the queue for it is empty.")

        if self.bots:
            if 'bots_names' not in cfg['sortinghat']:
                logger.info("[sortinghat] Bots name list not configured. Skipping.")
            else:
                logger.info("[sortinghat] Marking bots: %s",
                            cfg['sortinghat']['bots_names'])
                for name in cfg['sortinghat']['bots_names']:
                    # First we need the uuids for the profile name
                    uuids = self.__get_uuids_from_profile_name(name)
                    # Then we can modify the profile setting bot flag
                    for uuid in uuids:
                        api.edit_profile(self.db, uuid, is_bot=True)
                # For quitting the bot flag - debug feature
                # NOTE(review): this branch is only evaluated when
                # 'bots_names' is configured - confirm this nesting is
                # the intended one.
                if 'no_bots_names' in cfg['sortinghat']:
                    logger.info("[sortinghat] Removing Marking bots: %s",
                                cfg['sortinghat']['no_bots_names'])
                    for name in cfg['sortinghat']['no_bots_names']:
                        uuids = self.__get_uuids_from_profile_name(name)
                        for uuid in uuids:
                            api.edit_profile(self.db, uuid, is_bot=False)

        # Autorefresh must be done once identities processing has finished
        # Give 5s so the queue is filled and if not, continue without it
        try:
            autorefresh_backends = TasksManager.AUTOREFRESH_QUEUE.get(timeout=5)
            for backend_section in autorefresh_backends:
                autorefresh_backends[backend_section] = True
            TasksManager.AUTOREFRESH_QUEUE.put(autorefresh_backends)
            logger.debug("Autorefresh queue after processing identities: %s",
                         autorefresh_backends)
        except Empty:
            logger.warning("Autorefresh not active because the queue for it is empty.")
    finally:
        # Clear the flag on every exit path: without this, an exception in
        # any step above would leave IDENTITIES_TASKS_ON stuck at True.
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            TasksManager.IDENTITIES_TASKS_ON = False