def run_figures_monthly_metrics():
    """Queue monthly metrics collection for every site.

    Fires one asynchronous Celery task per site so each site's monthly
    metrics are populated independently.
    """
    logger.info('Starting figures.tasks.run_figures_monthly_metrics...')
    for current_site in get_sites():
        populate_monthly_metrics_for_site.delay(site_id=current_site.id)
def backfill_figures_ed():
    """Backfill Figures enrollment data for every site.

    Runs synchronously, printing progress per site.

    Returns a dict mapping each site's id to that site's backfill result.
    """
    backfilled = {}
    for current_site in get_sites():
        print('Backfilling enrollment data for site "{}"'.format(
            current_site.domain))
        backfilled[current_site.id] = backfill_enrollment_data_for_site(
            current_site)
    return backfilled
def populate_daily_metrics_next(site_id=None, force_update=False):
    """Next iteration of daily metrics collection for sites in a deployment.

    Top level Celery task run every 24 hours to update Figures data.

    * It updates Figures per-enrollment data, then collects daily aggregate
      metrics
    * Its purpose is to collect new metrics on an ongoing basis and NOT to
      serve the dual purpose of collecting ongoing data AND backfilling data
    * The driver for this change is to improve performance of the daily
      Celery jobs

    What's different? Figures collects the enrollment data first, then
    aggregates daily data.

    :param site_id: optional primary key; restricts the run to a single site
    :param force_update: passed through to the per-site collection call

    TODO: Draft up public architecture docs and reference them here
    """
    # Honor the pipeline kill switch before doing any work.
    if waffle.switch_is_active(WAFFLE_DISABLE_PIPELINE):
        logger.warning('Figures pipeline is disabled due to %s being active.',
                       WAFFLE_DISABLE_PIPELINE)
        return

    date_for = datetime.datetime.utcnow().date()
    sites = get_sites_by_id((site_id, )) if site_id is not None else get_sites()
    sites_count = sites.count()

    # Task entry log message
    start_msg = '{prefix}:START:date_for={date_for}, site_count={site_count}'
    logger.info(start_msg.format(prefix=FPD_LOG_PREFIX,
                                 date_for=date_for,
                                 site_count=sites_count))

    for idx, site in enumerate(sites):
        site_msg = '{prefix}:SITE:START:{id}:{domain} - Site {i:04d} of {n:04d}'
        logger.info(site_msg.format(prefix=FPD_LOG_PREFIX,
                                    id=site.id,
                                    domain=site.domain,
                                    i=idx,
                                    n=sites_count))
        try:
            populate_daily_metrics_for_site(site_id=site.id,
                                            date_for=date_for,
                                            ed_next=True,
                                            force_update=force_update)
        except Exception:  # pylint: disable=broad-except
            # Site-level guard: one failing site must not abort the others.
            fail_msg = ('{prefix}:FAIL populate_daily_metrics unhandled site level'
                        ' exception for site[{site_id}]={domain}')
            logger.exception(fail_msg.format(prefix=FPD_LOG_PREFIX,
                                             site_id=site.id,
                                             domain=site.domain))

    end_msg = '{prefix}:END:date_for={date_for}, site_count={site_count}'
    logger.info(end_msg.format(prefix=FPD_LOG_PREFIX,
                               date_for=date_for,
                               site_count=sites_count))
def populate_all_mau():
    """Collect monthly-active-user metrics for every site.

    Initially run every day so MAU accumulation for the month can be
    observed while the results are evaluated.
    """
    for current_site in get_sites():
        populate_mau_metrics_for_site(site_id=current_site.id,
                                      force_update=False)
def handle(self, *args, **options):
    """Backfill Figures metrics for one site (via `--site`) or all sites."""
    print('BEGIN: Backfill Figures Metrics')
    site_identifier = options['site']
    sites = [get_site(site_identifier)] if site_identifier else get_sites()
    for site in sites:
        backfill_site(site, overwrite=options['overwrite'])
    print('DONE: Backfill Figures Metrics')
def run_figures_monthly_metrics():
    """
    Populate monthly metrics for all sites.

    Top level Celery task. Skips all work when the Figures pipeline kill
    switch is active; otherwise fans out one Celery subtask per site via a
    ``group`` so sites are processed independently.
    """
    if waffle.switch_is_active(WAFFLE_DISABLE_PIPELINE):
        # Log at warning level to match how populate_daily_metrics reports
        # the same disabled-pipeline condition.
        logger.warning('Figures pipeline is disabled due to %s being active.',
                       WAFFLE_DISABLE_PIPELINE)
        return

    logger.info('Starting figures.tasks.run_figures_monthly_metrics...')

    all_sites_jobs = group(populate_monthly_metrics_for_site.s(site.id)
                           for site in get_sites())
    all_sites_jobs.delay()
def handle(self, *args, **options):
    """Update Figures EnrollmentData for one site (via `--site`) or all sites."""
    print('BEGIN: Update Figures EnrollmentData')
    site_identifier = options['site']
    sites = [get_site(site_identifier)] if site_identifier else get_sites()
    for site in sites:
        print('Updating EnrollmentData for site "{}"'.format(site.domain))
        if options['no_delay']:
            # Run synchronously in-process
            update_enrollment_data(site_id=site.id)
        else:
            update_enrollment_data.delay(
                site_id=site.id)  # pragma: no cover
    print('DONE: Update Figures EnrollmentData')
def get_site_ids(self, identifier=None):
    """Resolve an optional site identifier to a list of Site ids.

    Quick-n-dirty function to let the caller choose the site id or domain:
    an identifier that parses as an int is treated as a primary key,
    anything else as a domain. With no identifier, all available Sites are
    used. Let the lookup fail if no record matches the identifier.

    Returns Site ids for passing to Celery tasks. Note that at present,
    none of the tasks handle more than one specified Site.
    """
    if identifier:
        # EAFP: try the pk interpretation first, fall back to domain.
        try:
            filter_arg = dict(pk=int(identifier))
        except ValueError:
            filter_arg = dict(domain=identifier)
        sites = Site.objects.filter(**filter_arg)
    else:
        sites = get_sites()
    return [site.id for site in sites]
def populate_daily_metrics(site_id=None, date_for=None, force_update=False):
    """Runs Figures daily metrics collection

    This is a top level Celery task run every 24 hours to collect metrics.

    It iterates over each site to populate CourseDailyMetrics records for the
    courses in each site, then populates that site's SiteDailyMetrics record.

    :param site_id: optional primary key; restricts the run to a single site
    :param date_for: optional date (or date-castable value) to collect for.
        Defaults to today (UTC). A future date raises
        ``DateForCannotBeFutureError``. A past date is a backfill and skips
        the enrollment data update, as that is expensive.
    :param force_update: passed through to the per-site collection call

    Developer note: Errors need to be handled at each layer in the call chain
    1. Site
    2. Course
    3. Learner
    and for any auxiliary data collection that may be added in the future
    to this task. Those need to be wrapped in `try/except` blocks too

    This function will get reworked so that each site runs in its own
    """
    if waffle.switch_is_active(WAFFLE_DISABLE_PIPELINE):
        logger.warning('Figures pipeline is disabled due to %s being active.',
                       WAFFLE_DISABLE_PIPELINE)
        return

    # The date_for handling is very similar to the new rule we have in
    # `figures.pipeline.helpers.pipeline_data_for_rule`
    # The difference is the following code does not set 'date_for' as yesterday
    # So we likely want to rework the pipeline rule function and this code
    # so that we have a generalized date_for rule that can take an optional
    # transform function, like `prev_day`
    today = datetime.datetime.utcnow().replace(tzinfo=utc).date()

    # TODO: Decide if/how we want any special logging if we get an exception
    # on 'casting' the date_for argument as a datetime.date object
    if date_for:
        date_for = as_date(date_for)
        if date_for > today:
            msg = '{prefix}:ERROR - Attempted pipeline call with future date: "{date_for}"'
            raise DateForCannotBeFutureError(msg.format(prefix=FPD_LOG_PREFIX,
                                                        date_for=date_for))
    else:
        date_for = today
    # Don't update enrollment data if we are backfilling (loading data for
    # previous dates) as it is expensive
    do_update_enrollment_data = date_for >= today

    if site_id is not None:
        sites = get_sites_by_id((site_id, ))
    else:
        sites = get_sites()
    sites_count = sites.count()

    # This is our task entry log message
    msg = '{prefix}:START:date_for={date_for}, site_count={site_count}'
    logger.info(msg.format(prefix=FPD_LOG_PREFIX,
                           date_for=date_for,
                           site_count=sites_count))

    if is_past_date(date_for):
        msg = ('{prefix}:INFO - CourseDailyMetrics.average_progress will not be '
               'calculated for past date {date_for}')
        logger.info(msg.format(date_for=date_for, prefix=FPD_LOG_PREFIX))

    for i, site in enumerate(sites):
        msg = '{prefix}:SITE:START:{id}:{domain} - Site {i:04d} of {n:04d}'
        logger.info(msg.format(prefix=FPD_LOG_PREFIX,
                               id=site.id,
                               domain=site.domain,
                               i=i,
                               n=sites_count))
        try:
            populate_daily_metrics_for_site(site_id=site.id,
                                            date_for=date_for,
                                            force_update=force_update)
        except Exception:  # pylint: disable=broad-except
            # Site-level guard: one failing site must not abort the others.
            msg = ('{prefix}:FAIL populate_daily_metrics unhandled site level'
                   ' exception for site[{site_id}]={domain}')
            logger.exception(msg.format(prefix=FPD_LOG_PREFIX,
                                        site_id=site.id,
                                        domain=site.domain))

        # Until we implement signal triggers
        if do_update_enrollment_data:
            try:
                update_enrollment_data(site_id=site.id)
            except Exception:  # pylint: disable=broad-except
                msg = ('{prefix}:FAIL figures.tasks update_enrollment_data '
                       ' unhandled exception. site[{site_id}]:{domain}')
                logger.exception(msg.format(prefix=FPD_LOG_PREFIX,
                                            site_id=site.id,
                                            domain=site.domain))

        msg = '{prefix}:SITE:END:{id}:{domain} - Site {i:04d} of {n:04d}'
        logger.info(msg.format(prefix=FPD_LOG_PREFIX,
                               id=site.id,
                               domain=site.domain,
                               i=i,
                               n=sites_count))

    msg = '{prefix}:END:date_for={date_for}, site_count={site_count}'
    logger.info(msg.format(prefix=FPD_LOG_PREFIX,
                           date_for=date_for,
                           site_count=sites_count))