def get_users(self, date_start, date_end, project='enwiki'):
    """
    Lazily yield MediaWiki user IDs for the given project.

    The query template is looked up in ``self.QUERY_TYPES`` under the key
    ``self._query_type``, has its ``db`` token substituted with the escaped
    project name, and is executed with the two date bounds as bind
    parameters.

    Parameters
    ~~~~~~~~~~

        date_start : passed through ``format_mediawiki_timestamp``
            Lower date bound, bound to the query as ``date_start``.

        date_end : passed through ``format_mediawiki_timestamp``
            Upper date bound, bound to the query as ``date_end``.

        project : str
            Key into ``settings.PROJECT_DB_MAP`` selecting the database
            instance; also substituted into the query as the db name.

    Yields the first column of every row produced by the query.
    """
    # @TODO MOVE DB REFS INTO QUERY MODULE

    # Normalise both bounds before any connection is opened.
    start_ts = format_mediawiki_timestamp(date_start)
    end_ts = format_mediawiki_timestamp(date_end)
    bind_values = dict(date_start=start_ts, date_end=end_ts)

    connection = Connector(instance=settings.PROJECT_DB_MAP[project])

    template = self.QUERY_TYPES[self._query_type]
    sql = sub_tokens(template, db=escape_var(project))

    connection._cur_.execute(sql, bind_values)
    for record in connection._cur_:
        yield record[0]
def generate_test_cohort(project,
                         max_size=10,
                         write=False,
                         user_interval_size=1,
                         rev_interval_size=7,
                         rev_lower_limit=0):
    """
    Build a test cohort (list of UIDs) for the given project.

    The acceptance window starts 60 days before "now".  Users are taken
    from the first ``user_interval_size`` days of that window and revisions
    from the first ``rev_interval_size`` days; how exactly the bounds are
    applied is defined by the ``SELECT_PROJECT_IDS`` query.

    Parameters
    ~~~~~~~~~~

        project : str
           Wikipedia project e.g. 'enwiki'.  Must be a key of
           ``settings.PROJECT_DB_MAP``.

        max_size : uint
           Maximum number of users to include in the cohort (bound to the
           query as ``max_size``).

        write : boolean
           Flag indicating whether to persist the cohort via
           ``query_mod.add_cohort_data`` (presumably into
           settings.__cohort_meta_db__ and settings.__cohort_db__).

        user_interval_size : uint
           Number of days, counted from the window start, within which to
           take registered users.

        rev_interval_size : uint
           Number of days, counted from the window start, that bound the
           revision interval.

        rev_lower_limit : int
           Lower revision-count limit bound to the query as
           ``rev_lower_limit`` (presumably the minimum number of revisions
           a user must have inside the revision interval).

    Returns the list of rows produced by the query for the corresponding
    project; these define the test cohort.

    Raises ``Exception`` when ``max_size`` or ``rev_lower_limit`` cannot be
    converted to an integer.
    """

    # Determine the time bounds that define the cohort acceptance criteria
    ts_start_o = datetime.now() + timedelta(days=-60)
    ts_end_user_o = ts_start_o + timedelta(days=int(user_interval_size))
    ts_end_revs_o = ts_start_o + timedelta(days=int(rev_interval_size))

    ts_start = format_mediawiki_timestamp(ts_start_o)
    ts_end_user = format_mediawiki_timestamp(ts_end_user_o)
    ts_end_revs = format_mediawiki_timestamp(ts_end_revs_o)

    # Synthesize query and execute
    logging.info(__name__ + ' :: Getting users from {0}.\n\n'
                            '\tUser interval: {1} - {2}\n'
                            '\tRevision interval: {1} - {3}\n'
                            '\tMax users = {4}\n'
                            '\tMin revs = {5}\n'.
                 format(project,
                        ts_start,
                        ts_end_user,
                        ts_end_revs,
                        max_size,
                        rev_lower_limit))
    query = sub_tokens(SELECT_PROJECT_IDS, db=escape_var(str(project)))

    # @TODO MOVE DB REFS INTO QUERY MODULE

    # Coerce the bind parameters up front so that bad input is reported
    # before a connection is opened.
    try:
        params = {
            'ts_start': str(ts_start),
            'ts_end_user': str(ts_end_user),
            'ts_end_revs': str(ts_end_revs),
            'max_size': int(max_size),
            'rev_lower_limit': int(rev_lower_limit),
        }
    except ValueError as e:
        raise Exception(__name__ + ' :: Bad params ' + str(e))

    conn = Connector(instance=settings.PROJECT_DB_MAP[project])
    try:
        conn._cur_.execute(query, params)
        users = list(conn._cur_)
    finally:
        # Drop the connector reference even when the query fails.
        del conn

    # get latest cohort id & cohort name
    utm_name = generate_test_cohort_name(project)

    # add new ids to usertags & usertags_meta
    if write:
        # Placeholders must match the three positional arguments; the
        # previous '{2} - {3}' raised IndexError on every write.
        logging.info(__name__ + ' :: Inserting records...\n\n'
                                '\tCohort name - {0}\n'
                                '\t{1} - {2} record(s)\n'.
                     format(utm_name,
                            settings.__cohort_db__,
                            len(users)))
        query_mod.add_cohort_data(utm_name, users, project)

    return users