def extract(self, date_for=None, **kwargs):
    """Gather the site-wide daily metric values for ``date_for``.

    The user count is taken from the User model since there can be
    registered users who have not enrolled.

    TODO: Exclude non-students from the user count

    Args:
        date_for: Day to collect for. When falsy, the day before the current
            UTC date is used.
        **kwargs: Accepted but not read by this method.

    Returns:
        dict with the keys ``todays_active_user_count``,
        ``cumulative_active_user_count``, ``total_user_count``,
        ``course_count`` and ``total_enrollment_count``.
    """
    if not date_for:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # Both counts include everything strictly before
    # as_datetime(next_day(date_for))
    total_users = get_user_model().objects.filter(
        date_joined__lt=as_datetime(next_day(date_for))).count()
    total_courses = CourseOverview.objects.filter(
        created__lt=as_datetime(next_day(date_for))).count()

    active_today = get_active_user_count_for_date(date_for)
    # Cumulative value = the previous day's cumulative value plus today's
    cumulative_active = (
        get_previous_cumulative_active_user_count(date_for) + active_today)
    total_enrollments = get_total_enrollment_count(date_for)

    return {
        'todays_active_user_count': active_today,
        'cumulative_active_user_count': cumulative_active,
        'total_user_count': total_users,
        'course_count': total_courses,
        'total_enrollment_count': total_enrollments,
    }
def load(self, date_for=None, force_update=False, **_kwargs):
    """Return the course daily metrics record for ``date_for``, building it if needed.

    TODO: clean up how we do this. We want to be able to call the loader
    with an existing data set (not having to call the extractor) but we
    need to make sure that the metrics row 'date_for' is the same as
    provided in the data. So before hacking something together, I want to
    think this over some more.

    If the record already exists and ``force_update`` is False, the existing
    record is returned with the 'created' flag set to False. This saves an
    unnecessary call to extract data.

    Args:
        date_for: Day to load. When falsy, the day before the current UTC
            date is used; otherwise it is converted with ``as_datetime`` and
            tagged with the UTC timezone.
        force_update: When True, data is collected and saved even if a
            record already exists.

    Returns:
        ``(record, False)`` for an untouched existing record, otherwise
        whatever ``self.save_metrics`` returns.

    Raises:
        ValidationError: if invalid data is attempted to be saved to the
            course daily metrics model instance.
    """
    if date_for:
        date_for = as_datetime(date_for).replace(tzinfo=utc)
    else:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    try:
        existing = CourseDailyMetrics.objects.get(course_id=self.course_id,
                                                  date_for=date_for)
    except CourseDailyMetrics.DoesNotExist:
        # record not found, move on to creating
        existing = None

    # record found, only update if force update flag is True
    if existing is not None and not force_update:
        return (existing, False)

    collected = self.get_data(date_for=date_for)
    return self.save_metrics(date_for=date_for, data=collected)
def calc_from_user_model():
    """Count distinct users whose ``date_joined`` lies in the period.

    ``start_date`` and ``end_date`` are free variables resolved from the
    surrounding scope. The bounds are exclusive: after the day before
    ``start_date`` and before the day after ``end_date``.
    """
    joined_in_period = get_user_model().objects.filter(
        date_joined__gt=prev_day(start_date),
        date_joined__lt=next_day(end_date),
    )
    return joined_in_period.values('id').distinct().count()
def calc_from_user_model():
    """Count distinct site users whose ``date_joined`` lies in the period.

    ``site``, ``start_date`` and ``end_date`` are free variables resolved
    from the surrounding scope. The bounds are exclusive: after the day
    before ``start_date`` and before the day after ``end_date``, each
    converted with ``as_datetime``.
    """
    joined_after = as_datetime(prev_day(start_date))
    joined_before = as_datetime(next_day(end_date))
    site_users = figures.sites.get_users_for_site(site)
    in_period = site_users.filter(date_joined__gt=joined_after,
                                  date_joined__lt=joined_before)
    return in_period.values('id').distinct().count()
def setup(self, db):
    """Create the users, courses and metrics records used by the tests.

    Builds three users and three course overviews dated 60 days before
    ``self.date_for``, one CourseDailyMetrics record per entry in
    ``CDM_INPUT_TEST_DATA``, and a SiteDailyMetrics record for the previous
    day. In multisite mode it also creates an organization linked to the
    site and to every course, and (when organizations support sites) maps
    every user to that organization.
    """
    self.date_for = datetime.date(2018, 10, 1)
    self.site = Site.objects.first()
    sixty_days_earlier = self.date_for - datetime.timedelta(days=60)

    self.users = [
        UserFactory(date_joined=as_datetime(sixty_days_earlier))
        for _ in range(3)
    ]
    self.course_overviews = [
        CourseOverviewFactory(created=as_datetime(sixty_days_earlier))
        for _ in range(3)
    ]
    self.cdm_recs = [
        CourseDailyMetricsFactory(site=self.site,
                                  date_for=self.date_for,
                                  **cdm_kwargs)
        for cdm_kwargs in CDM_INPUT_TEST_DATA
    ]
    self.prev_day_sdm = SiteDailyMetricsFactory(
        site=self.site,
        date_for=prev_day(self.date_for),
        **SDM_DATA[1])

    if not is_multisite():
        return

    self.organization = OrganizationFactory(sites=[self.site])
    for overview in self.course_overviews:
        OrganizationCourseFactory(organization=self.organization,
                                  course_id=str(overview.id))
    if organizations_support_sites():
        for member in self.users:
            UserOrganizationMappingFactory(
                user=member, organization=self.organization)
def extract(self, site, date_for=None, **kwargs):  # pylint: disable=unused-argument
    """Gather the daily metric values for ``site`` on ``date_for``.

    The user count is taken from the site's users since there can be
    registered users who have not enrolled.

    TODO: Exclude non-students from the user count

    Args:
        site: Site whose users, courses and activity are counted.
        date_for: Day to collect for. When falsy, the day before the current
            UTC date is used.
        **kwargs: Accepted but not read by this method.

    Returns:
        dict with the keys ``todays_active_user_count``,
        ``cumulative_active_user_count``, ``total_user_count``,
        ``course_count``, ``total_enrollment_count`` and ``mau``.
    """
    if not date_for:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # Both counts include everything strictly before
    # as_datetime(next_day(date_for))
    total_users = get_users_for_site(site).filter(
        date_joined__lt=as_datetime(next_day(date_for))).count()
    total_courses = get_courses_for_site(site).filter(
        created__lt=as_datetime(next_day(date_for))).count()

    active_today = get_site_active_users_for_date(site, date_for).count()
    monthly_active = site_mau_1g_for_month_as_of_day(site, date_for)

    # Cumulative value = the previous day's cumulative value plus today's
    cumulative_active = (
        get_previous_cumulative_active_user_count(site, date_for)
        + active_today)
    total_enrollments = get_total_enrollment_count(site, date_for)

    return {
        'todays_active_user_count': active_today,
        'cumulative_active_user_count': cumulative_active,
        'total_user_count': total_users,
        'course_count': total_courses,
        'total_enrollment_count': total_enrollments,
        'mau': monthly_active.count(),
    }
def calc_from_course_enrollments():
    """Count distinct course ids among enrollments created in the period.

    ``start_date`` and ``end_date`` are free variables resolved from the
    surrounding scope. The bounds are exclusive: after the day before
    ``start_date`` and before the day after ``end_date``.
    """
    enrollments_in_period = CourseEnrollment.objects.filter(
        created__gt=prev_day(start_date),
        created__lt=next_day(end_date),
    )
    return enrollments_in_period.values('course_id').distinct().count()
def generate_cdm_data_for_course(course_id):
    """Generate randomized course daily metrics dicts for the last 180 days.

    One record is produced per day from 180 days before yesterday through
    yesterday. Each day's enrollment count, average progress and number of
    learners completed are derived from the previous day's record.

    Just getting it working first, then we'll make the values more
    reasonable, like
        value = sorted([lower_bound, x, upper_bound])[1]

    Args:
        course_id: Value stored under ``course_id`` in every record.

    Returns:
        list of dicts, one per day, in chronological order.
    """
    records = []
    previous = {}  # the prior day's record; empty for the first day
    last_day = prev_day(datetime.datetime.now())
    first_day = days_from(last_day, -180)

    for day in rrule(DAILY, dtstart=first_day, until=last_day):
        # The order of the random draws below is kept as-is on purpose
        enrolled = previous.get('enrollment_count', 0) + randint(0, 10)
        progress = gen_avg_progress(previous.get('average_progress', 0))
        days_to_complete = randint(10, 30)
        completed = gen_num_learners_completed(previous)
        active_today = randint(0, enrolled // 2)

        current = {
            'course_id': course_id,
            'date_for': day.strftime('%Y-%m-%d'),
            'enrollment_count': enrolled,
            'active_learners_today': active_today,
            'average_progress': progress,
            'average_days_to_complete': days_to_complete,
            'num_learners_completed': completed,
        }
        records.append(current)
        previous = current

    return records
def get_active_users_for_time_period(site, start_date, end_date, course_ids=None):
    """Return the number of site users active in the time period.

    Activity is determined by finding the unique student ids of
    StudentModule records modified in the time period.

    We don't filter with
        modified__range=(as_date(start_date), as_date(end_date)),
    only because it raises timezone warnings.

    Args:
        site: Site whose users are considered.
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        course_ids: Optional iterable of course ids; when truthy, only
            StudentModule records for those courses are counted.

    Returns:
        int: count of distinct students with matching StudentModule records.
    """
    # Get list of learners for the site
    user_ids = figures.sites.get_user_ids_for_site(site)
    filter_args = dict(
        modified__gt=as_datetime(prev_day(start_date)),
        modified__lt=as_datetime(next_day(end_date)),
        student_id__in=user_ids,
    )
    if course_ids:
        # Each StudentModule row holds a single 'course_id'; the previous
        # 'course_ids__in' lookup named a field that does not exist.
        filter_args['course_id__in'] = course_ids

    matching_modules = StudentModule.objects.filter(**filter_args)
    return matching_modules.values('student__id').distinct().count()
def calc_from_site_daily_metrics():
    """Return the largest ``total_user_count`` recorded in the period.

    ``start_date`` and ``end_date`` are free variables resolved from the
    surrounding scope; both days are included in the period. Returns 0 when
    no SiteDailyMetrics record falls in the period.
    """
    qs = SiteDailyMetrics.objects.filter(
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
    )
    # exists() asks the database whether any row matches; truth-testing the
    # queryset itself would fetch every matching row first.
    if not qs.exists():
        return 0
    return qs.aggregate(maxval=Max('total_user_count'))['maxval']
def test_get_previous_cumulative_active_user_count(self, prev_day_data, expected):
    """Check the previous day's cumulative active user count lookup.

    When ``prev_day_data`` is truthy, a SiteDailyMetrics record is created
    for the day before ``self.date_for`` using it as keyword arguments; the
    pipeline helper must then return ``expected``.
    """
    if prev_day_data:
        day_before = prev_day(self.date_for)
        SiteDailyMetricsFactory(date_for=day_before, **prev_day_data)

    result = pipeline_sdm.get_previous_cumulative_active_user_count(
        date_for=self.date_for)
    assert result == expected
def extract(self, course_id, date_for=None, **kwargs):
    """Collect the daily metric values for one course.

    Args:
        course_id: Course to collect metrics for.
        date_for: Day to collect for. When falsy, the day before the current
            UTC date is used.
        **kwargs: Accepted but not read by this method.

    Returns:
        dict with the keys ``date_for``, ``course_id``,
        ``enrollment_count``, ``active_learners_today``,
        ``average_progress``, ``average_days_to_complete`` and
        ``num_learners_completed``.
    """
    # Update args if not assigned
    if not date_for:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # We can turn this series of calls into a parallel set of calls defined
    # in a ruleset instead of hardcoded here.
    # The enrollments queryset is fetched once and reused to reduce calls.
    enrollments = get_course_enrollments(course_id, date_for)
    enrollment_count = enrollments.count()

    active_ids = get_active_learner_ids_today(course_id, date_for)
    active_count = active_ids.count() if active_ids else 0

    # This is the transform step. After we get this working, we can then
    # define the metrics declaratively.
    return {
        'date_for': date_for,
        'course_id': course_id,
        'enrollment_count': enrollment_count,
        'active_learners_today': active_count,
        'average_progress': get_average_progress(
            course_id, date_for, enrollments),
        'average_days_to_complete': get_average_days_to_complete(
            course_id, date_for),
        'num_learners_completed': get_num_learners_completed(
            course_id, date_for),
    }
def calc_from_course_enrollments():
    """Count distinct course ids among the site's enrollments in the period.

    ``site``, ``start_date`` and ``end_date`` are free variables resolved
    from the surrounding scope. The bounds are exclusive: after the day
    before ``start_date`` and before the day after ``end_date``.
    """
    created_after = prev_day(start_date)
    created_before = next_day(end_date)
    # Start from all the course enrollments for the site ...
    site_enrollments = figures.sites.get_course_enrollments_for_site(site)
    # ... then narrow them down to the time period
    in_period = site_enrollments.filter(created__gt=created_after,
                                        created__lt=created_before)
    return in_period.values('course_id').distinct().count()
def calc_from_course_daily_metrics():
    """Return the largest ``num_learners_completed`` recorded in the period.

    ``start_date`` and ``end_date`` are free variables resolved from the
    surrounding scope; both days are included in the period. Returns 0 when
    no CourseDailyMetrics record falls in the period.
    """
    qs = CourseDailyMetrics.objects.filter(
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
    )
    # exists() asks the database whether any row matches; truth-testing the
    # queryset itself would fetch every matching row first.
    if not qs.exists():
        return 0
    return qs.aggregate(maxval=Max('num_learners_completed'))['maxval']
def get_previous_cumulative_active_user_count(date_for):
    """Return the cumulative site-wide active user count for the previous day.

    Looks up the SiteDailyMetrics record for the day before ``date_for``.

    Returns 0 if there is no record for the previous day, or if that
    record's ``cumulative_active_user_count`` is falsy.
    """
    try:
        previous_record = SiteDailyMetrics.objects.get(
            date_for=prev_day(date_for))
    except SiteDailyMetrics.DoesNotExist:
        return 0
    return previous_record.cumulative_active_user_count or 0
def get_course_average_progress_for_time_period(start_date, end_date, course_id):
    """Return the mean ``average_progress`` for a course over a period.

    Args:
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        course_id: Course whose CourseDailyMetrics records are averaged.

    Returns:
        float: the average quantized to two decimal places, or 0.0 when
        there is nothing to average (no records in the period, or every
        record has a NULL ``average_progress``).
    """
    qs = CourseDailyMetrics.objects.filter(
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
        course_id=course_id,
    )
    # Avg yields None for an empty queryset and when every value is NULL, so
    # one aggregate query covers both cases without fetching the rows and
    # without passing None to Decimal (which raises TypeError).
    average = qs.aggregate(average=Avg('average_progress'))['average']
    if average is None:
        return 0.0
    return float(Decimal(average).quantize(Decimal('.00')))
def get_course_enrolled_users_for_time_period(start_date, end_date, course_id):
    """Return the largest ``enrollment_count`` for a course over a period.

    Args:
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        course_id: Course whose CourseDailyMetrics records are inspected.

    Returns:
        The maximum ``enrollment_count`` among the matching records, or 0
        when no record falls in the period.
    """
    qs = CourseDailyMetrics.objects.filter(
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
        course_id=course_id,
    )
    # exists() asks the database whether any row matches; truth-testing the
    # queryset itself would fetch every matching row first.
    if not qs.exists():
        return 0
    return qs.aggregate(maxval=Max('enrollment_count'))['maxval']
def setup(self, db):
    """Create the course, enrollments and related records used by the tests.

    Builds one course overview with four enrollments (the factory keyword
    depends on the Open edX release), optional site/organization mappings,
    one course access role per entry in ``self.COURSE_ROLES``, student
    modules for yesterday and today, and three generated certificates
    created 10, 20 and 30 days after the matching enrollment.
    """
    self.today = datetime.date(2018, 6, 1)
    self.course_overview = CourseOverviewFactory()

    # Ginkgo's factory takes the course id; later releases take the course
    if OPENEDX_RELEASE == GINKGO:
        self.course_enrollments = [
            CourseEnrollmentFactory(course_id=self.course_overview.id)
            for _ in range(4)
        ]
    else:
        self.course_enrollments = [
            CourseEnrollmentFactory(course=self.course_overview)
            for _ in range(4)
        ]

    if organizations_support_sites():
        self.my_site = SiteFactory(domain='my-site.test')
        self.my_site_org = OrganizationFactory(sites=[self.my_site])
        OrganizationCourseFactory(organization=self.my_site_org,
                                  course_id=str(self.course_overview.id))
        for enrollment in self.course_enrollments:
            UserOrganizationMappingFactory(user=enrollment.user,
                                           organization=self.my_site_org)

    access_roles = []
    for idx, role in enumerate(self.COURSE_ROLES):
        enrollment = self.course_enrollments[idx]
        access_roles.append(CourseAccessRoleFactory(
            user=enrollment.user,
            course_id=enrollment.course_id,
            role=role,
        ))
    self.course_access_roles = access_roles

    # create student modules for yesterday and today
    # (the attribute is reassigned on each pass, so it ends up holding only
    # the modules created for the last day)
    for module_day in (prev_day(self.today), self.today):
        self.student_modules = [
            StudentModuleFactory(course_id=enrollment.course_id,
                                 student=enrollment.user,
                                 created=enrollment.created,
                                 modified=as_datetime(module_day))
            for enrollment in self.course_enrollments
        ]

    self.cert_days_to_complete = [10, 20, 30]
    self.expected_avg_cert_days_to_complete = 20

    certificates = []
    for idx, day_count in enumerate(self.cert_days_to_complete):
        enrollment = self.course_enrollments[idx]
        certificates.append(GeneratedCertificateFactory(
            user=enrollment.user,
            course_id=enrollment.course_id,
            created_date=(enrollment.created
                          + datetime.timedelta(days=day_count)),
        ))
    self.generated_certificates = certificates
def pipeline_date_for_rule(date_for):
    """Common logic to assign the 'date_for' date for daily pipeline processing

    * If 'date_for' is 'None' or today, then this function returns a
      'datetime.date' instance for yesterday
    * If 'date_for' is a date in the past, this function returns the
      'datetime.date' representation of the date
    * If 'date_for' is in the future, then `DateForCannotBeFutureError` is
      raised

    As part of normal Figures data collection, the pipeline must collect data
    from the previous calendar day, assuming all timestamps are UTC. This is
    to build a complete picture of a 24 hour period.

    This function exists to have this logic in a single place in the code.
    This logic is specific to the pipeline so it belongs in Figures' pipeline
    namespace.

    We may rework this as a decorator or as part of core functionality in a
    base class from which daily metrics classes can derive.
    """
    today = datetime.utcnow().replace(tzinfo=utc).date()

    if not date_for:
        return prev_day(today)

    # We work on the calendar day, and the daily metrics models use date and
    # not datetime for their 'date_for' fields
    requested_day = as_date(date_for)

    if requested_day > today:
        msg = 'Attempted pipeline call with future date: "{date_for}"'
        raise DateForCannotBeFutureError(msg.format(date_for=requested_day))

    if requested_day == today:
        # Today is not complete yet, so process yesterday instead
        return prev_day(today)

    # Either we are backfilling data (the date is prior to yesterday) or the
    # caller explicitly requested yesterday
    return requested_day
def get_course_average_days_to_complete_for_time_period(
        start_date, end_date, course_id):
    """Return the mean ``average_days_to_complete`` for a course over a period.

    Args:
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        course_id: Course whose CourseDailyMetrics records are averaged.

    Returns:
        int: the average rounded up to a whole number of days, or 0 when
        there is nothing to average (no records in the period, or every
        record has a NULL ``average_days_to_complete``).
    """
    qs = CourseDailyMetrics.objects.filter(
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
        course_id=course_id,
    )
    # Avg yields None for an empty queryset and when every value is NULL, so
    # one aggregate query covers both cases without fetching the rows and
    # without passing None to math.ceil (which raises TypeError).
    average = qs.aggregate(
        average=Avg('average_days_to_complete'))['average']
    if average is None:
        return 0
    return int(math.ceil(average))
def get_course_num_learners_completed_for_time_period(start_date, end_date, course_id):
    """Return the largest ``num_learners_completed`` for a course over a period.

    We're duplicating some code.

    Args:
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        course_id: Course whose CourseDailyMetrics records are inspected.

    Returns:
        The maximum ``num_learners_completed`` among the matching records,
        or 0 when no record falls in the period.
    """
    qs = CourseDailyMetrics.objects.filter(
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
        course_id=course_id,
    )
    # exists() asks the database whether any row matches; truth-testing the
    # queryset itself would fetch every matching row first.
    if not qs.exists():
        return 0
    return qs.aggregate(max=Max('num_learners_completed'))['max']
def setup(self, db):
    """Create the users, courses and metrics records used by the tests.

    Builds three users and three course overviews dated 60 days before
    ``self.date_for``, one CourseDailyMetrics record per entry in
    ``CDM_INPUT_TEST_DATA``, and a SiteDailyMetrics record for the previous
    day.
    """
    self.date_for = datetime.date(2018, 10, 1)
    sixty_days_earlier = self.date_for - datetime.timedelta(days=60)

    self.users = [
        UserFactory(date_joined=as_datetime(sixty_days_earlier))
        for _ in range(3)
    ]
    self.course_overviews = [
        CourseOverviewFactory(created=as_datetime(sixty_days_earlier))
        for _ in range(3)
    ]
    self.cdm_recs = [
        CourseDailyMetricsFactory(date_for=self.date_for, **cdm_kwargs)
        for cdm_kwargs in CDM_INPUT_TEST_DATA
    ]
    self.prev_day_sdm = SiteDailyMetricsFactory(
        date_for=prev_day(self.date_for),
        **SDM_PREV_DAY[1])
def load(self, site, date_for=None, force_update=False, **_kwargs):
    """Return the site daily metrics record for ``site`` and ``date_for``.

    Architectural note: Initially, we're going to be explicit, requiring
    callers to specify the site model instance to be associated with the
    site specific metrics model(s) we are populating.

    TODOs:
        Add filtering for
        * Multi-tenancy
        * Course access groups

    Args:
        site: Site the metrics record belongs to.
        date_for: Day to load. When falsy, the day before the current UTC
            date is used; otherwise it is converted with ``as_datetime`` and
            tagged with the UTC timezone.
        force_update: When True, skip the lookup of an existing record and
            always extract and store fresh data.

    Returns:
        tuple ``(record, created)``; ``created`` is False when an existing
        record was returned untouched.
    """
    if date_for:
        date_for = as_datetime(date_for).replace(tzinfo=utc)
    else:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # If we already have a record for the date and no update is forced,
    # skip getting data
    if not force_update:
        try:
            existing = SiteDailyMetrics.objects.get(site=site,
                                                    date_for=date_for)
        except SiteDailyMetrics.DoesNotExist:
            # proceed normally
            pass
        else:
            return (existing, False)

    data = self.extractor.extract(site=site, date_for=date_for)
    metric_fields = (
        'cumulative_active_user_count',
        'todays_active_user_count',
        'total_user_count',
        'course_count',
        'total_enrollment_count',
        'mau',
    )
    record, was_created = SiteDailyMetrics.objects.update_or_create(
        date_for=date_for,
        site=site,
        defaults={field: data[field] for field in metric_fields})
    return record, was_created
def get_active_users_for_time_period(start_date, end_date, site=None, course_ids=None):
    """Return the number of users active in the time period.

    Activity is determined by finding the unique student ids of
    StudentModule records modified in the time period.

    Args:
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        site: Accepted for signature compatibility; not used for filtering
            here.
        course_ids: Optional iterable of course ids; when truthy, only
            StudentModule records for those courses are counted.

    Returns:
        int: count of distinct students with matching StudentModule records.
    """
    filter_args = dict(
        # Both bounds apply to 'modified'. The lower bound used to filter on
        # 'created', which dropped learners whose module was created before
        # the period but modified during it.
        modified__gt=as_datetime(prev_day(start_date)),
        modified__lt=as_datetime(next_day(end_date)),
    )
    if course_ids:
        # Each StudentModule row holds a single 'course_id'; the previous
        # 'course_ids__in' lookup named a field that does not exist.
        filter_args['course_id__in'] = course_ids

    matching_modules = StudentModule.objects.filter(**filter_args)
    return matching_modules.values('student__id').distinct().count()
def get_total_enrollments_for_time_period(site, start_date, end_date, course_ids=None):  # pylint: disable=unused-argument
    """Return the maximum number of enrollments recorded in the period.

    This returns the count of unique enrollments, not unique learners.

    Args:
        site: Site whose SiteDailyMetrics records are inspected.
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).
        course_ids: Accepted but not used.

    Returns:
        The maximum ``total_enrollment_count`` among the matching records,
        or 0 when no record falls in the period.
    """
    qs = SiteDailyMetrics.objects.filter(
        site=site,
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
    )
    # exists() asks the database whether any row matches; truth-testing the
    # queryset itself would fetch every matching row first.
    if not qs.exists():
        return 0
    return qs.aggregate(maxval=Max('total_enrollment_count'))['maxval']
def load(self, date_for=None, force_update=False, **kwargs):
    """Return the course daily metrics record for ``date_for``.

    TODO: clean up how we do this. We want to be able to call the loader
    with an existing data set (not having to call the extractor) but we
    need to make sure that the metrics row 'date_for' is the same as
    provided in the data. So before hacking something together, I want to
    think this over some more.

    Args:
        date_for: Day to load. When falsy, the day before the current UTC
            date is used.
        force_update: When True, skip the lookup of an existing record and
            always collect and store fresh data.
        **kwargs: Accepted but not read by this method.

    Returns:
        tuple ``(record, created)``; ``created`` is False when an existing
        record was returned untouched.
    """
    if not date_for:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # If we already have a record for the date and no update is forced,
    # skip getting data
    if not force_update:
        try:
            existing = CourseDailyMetrics.objects.get(
                course_id=self.course_id, date_for=date_for)
        except CourseDailyMetrics.DoesNotExist:
            # proceed normally
            pass
        else:
            return (existing, False)

    data = self.get_data(date_for=date_for)
    metric_fields = (
        'enrollment_count',
        'active_learners_today',
        'average_progress',
        'average_days_to_complete',
        'num_learners_completed',
    )
    record, was_created = CourseDailyMetrics.objects.update_or_create(
        course_id=self.course_id,
        date_for=date_for,
        defaults={field: data[field] for field in metric_fields})
    return (record, was_created)
def get_total_site_users_for_time_period(site, start_date, end_date, **_kwargs):
    """Return the maximum ``total_user_count`` recorded for the site in the period.

    The signature follows the other metrics functions so the same handler
    method, ``get_monthly_history_metric``, can be used.

    TODO: Consider first trying to get the data from the SiteDailyMetrics
    model. If there are no records, then get the data from the User model

    Args:
        site: Site whose SiteDailyMetrics records are inspected.
        start_date: First day of the period (inclusive).
        end_date: Last day of the period (inclusive).

    Returns:
        The maximum ``total_user_count`` among the matching records, or 0
        when no record falls in the period.
    """
    qs = SiteDailyMetrics.objects.filter(
        site=site,
        date_for__gt=prev_day(start_date),
        date_for__lt=next_day(end_date),
    )
    # exists() asks the database whether any row matches; truth-testing the
    # queryset itself would fetch every matching row first.
    if not qs.exists():
        return 0
    return qs.aggregate(maxval=Max('total_user_count'))['maxval']
def load(self, date_for=None, force_update=False, **kwargs):
    """Return the site daily metrics record for ``date_for``.

    TODO: Add filtering for
    * Multi-tenancy
    * Course access groups

    Args:
        date_for: Day to load. When falsy, the day before the current UTC
            date is used.
        force_update: When True, skip the lookup of an existing record and
            always extract and store fresh data.
        **kwargs: Accepted but not read by this method.

    Returns:
        tuple ``(record, created)``; ``created`` is False when an existing
        record was returned untouched.
    """
    if not date_for:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # If we already have a record for the date and no update is forced,
    # skip getting data
    if not force_update:
        try:
            existing = SiteDailyMetrics.objects.get(date_for=date_for)
        except SiteDailyMetrics.DoesNotExist:
            # proceed normally
            pass
        else:
            return (existing, False)

    data = self.extractor.extract(date_for=date_for)
    metric_fields = (
        'cumulative_active_user_count',
        'todays_active_user_count',
        'total_user_count',
        'course_count',
        'total_enrollment_count',
    )
    record, was_created = SiteDailyMetrics.objects.update_or_create(
        date_for=date_for,
        defaults={field: data[field] for field in metric_fields})
    return record, was_created
def setup(self, db):
    """Create the course, enrollments and related records used by the tests.

    Builds one course overview with four enrollments, one course access
    role per entry in ``self.COURSE_ROLES``, student modules for yesterday
    and today, and three generated certificates created 10, 20 and 30 days
    after the matching enrollment.
    """
    self.today = datetime.date(2018, 6, 1)
    self.course_overview = CourseOverviewFactory()
    self.course_enrollments = [
        CourseEnrollmentFactory(course_id=self.course_overview.id)
        for _ in range(4)
    ]

    access_roles = []
    for idx, role in enumerate(self.COURSE_ROLES):
        enrollment = self.course_enrollments[idx]
        access_roles.append(CourseAccessRoleFactory(
            user=enrollment.user,
            course_id=enrollment.course_id,
            role=role,
        ))
    self.course_access_roles = access_roles

    # create student modules for yesterday and today
    # (the attribute is reassigned on each pass, so it ends up holding only
    # the modules created for the last day)
    for module_day in (prev_day(self.today), self.today):
        self.student_modules = [
            StudentModuleFactory(course_id=enrollment.course_id,
                                 student=enrollment.user,
                                 created=enrollment.created,
                                 modified=as_datetime(module_day))
            for enrollment in self.course_enrollments
        ]

    self.cert_days_to_complete = [10, 20, 30]
    self.expected_avg_cert_days_to_complete = 20

    certificates = []
    for idx, day_count in enumerate(self.cert_days_to_complete):
        enrollment = self.course_enrollments[idx]
        certificates.append(GeneratedCertificateFactory(
            user=enrollment.user,
            course_id=enrollment.course_id,
            created_date=(enrollment.created
                          + datetime.timedelta(days=day_count)),
        ))
    self.generated_certificates = certificates
def extract(self, course_id, date_for=None, **_kwargs):
    """Collect the daily metric values for one course.

    TODO: refactor this class
    Add lazy loading method to load course enrollments
    - Create a method for each metric field

    Args:
        course_id: Course to collect metrics for.
        date_for: Day to collect for. When falsy, the day before the current
            UTC date is used.

    Returns:
        dict with the keys ``date_for``, ``course_id``,
        ``enrollment_count``, ``active_learners_today``,
        ``average_progress``, ``average_days_to_complete`` and
        ``num_learners_completed``.
    """
    # Update args if not assigned
    if not date_for:
        utc_today = datetime.datetime.utcnow().replace(tzinfo=utc).date()
        date_for = prev_day(utc_today)

    # We can turn this series of calls into a parallel set of calls defined
    # in a ruleset instead of hardcoded here after retrieving the core
    # querysets
    enrollments = get_enrolled_in_exclude_admins(course_id, date_for)
    enrollment_count = enrollments.count()

    active_ids = get_active_learner_ids_today(course_id, date_for)
    active_count = active_ids.count() if active_ids else 0

    # Average progress
    progress_data = bulk_calculate_course_progress_data(
        course_id=course_id, date_for=date_for)

    # This is the transform step. After we get this working, we can then
    # define the metrics declaratively.
    return {
        'date_for': date_for,
        'course_id': course_id,
        'enrollment_count': enrollment_count,
        'active_learners_today': active_count,
        'average_progress': progress_data['average_progress'],
        'average_days_to_complete': get_average_days_to_complete(
            course_id, date_for),
        'num_learners_completed': get_num_learners_completed(
            course_id, date_for),
    }