def test_deduplicate_by_key(self):
    """
    deduplicate_by_key should return one record per distinct key value.

    Two of the three input records share the 'index' value 'two'; the
    expectation below is that the first record seen for that key is the one
    that survives.
    """
    collection_of_dicts = [
        {'index': 'one', 'other': '1'},
        {'index': 'two', 'other': '2'},
        {'index': 'two', 'other': '3'},
    ]

    no_duplicates = deduplicate_by_key(collection_of_dicts, lambda r: r['index'])

    expected = collection_of_dicts[0:2]
    # Sort on the dedup key instead of on the dicts themselves: dicts are not
    # orderable on Python 3 (sorted() raises TypeError) and only implicitly
    # ordered on Python 2.  The key is unique after deduplication, so this
    # gives a deterministic order that matches `expected`.
    assert_equal(sorted(no_duplicates, key=lambda r: r['index']), expected)
def test_deduplicate_by_key(self):
    """
    Check that deduplicate_by_key collapses records sharing the same key.

    The last two records both have 'index' == 'two'; the test expects only
    the first of them to be kept, alongside the 'one' record.
    """
    def index_of(record):
        # the field the records are deduplicated (and then ordered) by
        return record['index']

    collection_of_dicts = [
        {
            'index': 'one',
            'other': '1'
        },
        {
            'index': 'two',
            'other': '2'
        },
        {
            'index': 'two',
            'other': '3'
        },
    ]

    no_duplicates = deduplicate_by_key(collection_of_dicts, index_of)
    expected = collection_of_dicts[0:2]

    # Order by the dedup key rather than comparing dicts directly, which is
    # a TypeError on Python 3 and an implementation detail on Python 2.
    assert_equal(sorted(no_duplicates, key=index_of), expected)
def validate_records(self, session, cohort):
    """
    Fetches the wiki_user(s) already added for the given cohort and validates
    their mediawiki_username against their stated project as either a user_id
    or user_name.  Once done, sets the valid state and deletes any duplicates.
    Then, it finishes filling in the data model by inserting corresponding
    records into the cohort_wiki_users table.

    Records whose project cannot be normalized are flagged invalid (with a
    reason) and skipped for validation; they are still linked to the cohort.
    A failure while processing one record is logged and does not stop the
    remaining records from being processed.

    This is meant to execute asynchronously on celery

    Parameters
        session : an active wikimetrics db session to use
        cohort  : the cohort to validate; must belong to session
    """
    import logging
    logger = logging.getLogger(__name__)

    # remembered up front so the error handler below never has to touch an
    # ORM attribute while the session may be in a failed state
    cohort_id = cohort.id

    # reset the cohort validation status so it can't be used for reports
    cohort.validated = False
    session.execute(
        WikiUser.__table__.update().values(valid=None).where(
            WikiUser.validating_cohort == cohort.id
        )
    )
    session.execute(CohortWikiUser.__table__.delete().where(
        CohortWikiUser.cohort_id == cohort.id
    ))
    session.commit()

    wikiusers = session.query(WikiUser) \
        .filter(WikiUser.validating_cohort == cohort.id) \
        .all()

    deduplicated = deduplicate_by_key(
        wikiusers,
        lambda r: (r.mediawiki_username, r.project)
    )

    wikiusers_by_project = {}
    for wu in deduplicated:
        try:
            normalized_project = normalize_project(wu.project)
            if normalized_project is None:
                wu.reason_invalid = 'invalid project: {0}'.format(wu.project)
                wu.valid = False
                continue

            wu.project = normalized_project
            batch = wikiusers_by_project.setdefault(normalized_project, [])
            batch.append(wu)

            # validate bunches of records to update the UI but not kill performance
            if len(batch) > 999:
                validate_users(
                    batch,
                    normalized_project,
                    self.validate_as_user_ids
                )
                session.commit()
                # start a fresh batch for this project
                wikiusers_by_project[normalized_project] = []
        except Exception:
            # Best effort: one bad record must not abort the whole cohort.
            # Only Exception is caught so KeyboardInterrupt / SystemExit
            # still propagate, and the failure is logged instead of vanishing.
            logger.exception(
                'error while validating a wiki user of cohort %s', cohort_id
            )
            continue

    # validate anything that wasn't big enough for a batch
    for project, remaining in wikiusers_by_project.items():
        if remaining:
            validate_users(remaining, project, self.validate_as_user_ids)
    session.commit()

    # projects were normalized above, so records that differed only in how
    # the project was spelled now share a key; deduplicate once more
    unique_and_validated = deduplicate_by_key(
        deduplicated,
        lambda r: (r.mediawiki_username, r.project)
    )
    kept_ids = [wu.id for wu in unique_and_validated]

    # Skip the bulk insert when there is nothing to link.  NOTE(review): an
    # empty parameter list is not a no-op in every SQLAlchemy version (it can
    # be executed as a single parameterless INSERT) -- confirm for the
    # version in use.
    if kept_ids:
        session.execute(
            CohortWikiUser.__table__.insert(),
            [
                {
                    'cohort_id'     : cohort.id,
                    'wiki_user_id'  : wiki_user_id,
                }
                for wiki_user_id in kept_ids
            ]
        )

    # clean up any duplicate wiki_user records
    session.execute(WikiUser.__table__.delete().where(and_(
        WikiUser.validating_cohort == cohort.id,
        WikiUser.id.notin_(kept_ids)
    )))
    cohort.validated = True
    session.commit()
def validate_records(self, session, cohort):
    """
    Fetches the wiki_user(s) already added for the given cohort and validates
    their raw_id_or_name field against their stated project as either a
    user_id or user_name.  Once done, sets the valid state and deletes any
    duplicates.  Then, it finishes filling in the data model by inserting
    corresponding records into the cohort_wiki_users table.

    Records whose project cannot be normalized are flagged invalid (with a
    reason) and skipped for validation; they are still linked to the cohort.

    This is meant to execute asynchronously on celery

    Parameters
        session : an active wikimetrics db session to use
        cohort  : the cohort to validate; must belong to session
    """
    # reset the cohort validation status so it can't be used for reports
    cohort.validated = False
    session.execute(WikiUserStore.__table__.update().values(
        valid=None).where(WikiUserStore.validating_cohort == cohort.id))
    session.execute(CohortWikiUserStore.__table__.delete().where(
        CohortWikiUserStore.cohort_id == cohort.id))
    session.commit()

    wikiusers = session.query(WikiUserStore) \
        .filter(WikiUserStore.validating_cohort == cohort.id) \
        .all()

    # Deduplicate on the normalized project where one exists, so the same
    # user listed under two spellings of a project counts once; fall back to
    # the raw project string when it cannot be normalized.
    deduplicated = deduplicate_by_key(
        wikiusers,
        lambda r: (r.raw_id_or_name, normalize_project(r.project) or r.project))

    wikiusers_by_project = {}
    for wu in deduplicated:
        normalized_project = normalize_project(wu.project)
        if normalized_project is None:
            wu.reason_invalid = 'invalid project: {0}'.format(wu.project)
            wu.valid = False
            continue

        wu.project = normalized_project
        batch = wikiusers_by_project.setdefault(normalized_project, [])
        batch.append(wu)

        # validate bunches of records to update the UI but not kill performance
        if len(batch) > 999:
            validate_users(batch, normalized_project, self.validate_as_user_ids)
            session.commit()
            # start a fresh batch for this project
            wikiusers_by_project[normalized_project] = []

    # validate anything that wasn't big enough for a batch
    for project, remaining in wikiusers_by_project.items():
        if remaining:
            validate_users(remaining, project, self.validate_as_user_ids)
    session.commit()

    kept_ids = [wu.id for wu in deduplicated]

    # Skip the bulk insert when there is nothing to link.  NOTE(review): an
    # empty parameter list is not a no-op in every SQLAlchemy version (it can
    # be executed as a single parameterless INSERT) -- confirm for the
    # version in use.
    if kept_ids:
        session.execute(CohortWikiUserStore.__table__.insert(), [{
            'cohort_id': cohort.id,
            'wiki_user_id': wiki_user_id,
        } for wiki_user_id in kept_ids])

    # clean up any duplicate wiki_user records
    session.execute(WikiUserStore.__table__.delete().where(
        and_(WikiUserStore.validating_cohort == cohort.id,
             WikiUserStore.id.notin_(kept_ids))))
    cohort.validated = True
    session.commit()