def make_enrollment_events(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    Create an enrollment_events table, based on scanning the tracking logs.
    Record all enrollment events, ordered by time.

    course_id          : edX course id the table is built for
    force_recompute    : if True, recompute the table even if it already exists
    use_dataset_latest : if True, use the "latest" variant of the course dataset
    end_date           : optional last tracking-log date to process
    '''
    # Legacy-BigQuery SQL.  The {course_id}, {DATASETS} and {last_date}
    # placeholders are filled in by run_query_on_tracking_logs, which runs this
    # incrementally over the daily tracking-log tables and appends new rows.
    # Note the trailing comma after "event_type," before FROM is tolerated by
    # the legacy BigQuery dialect.
    SQL = """
               SELECT "{course_id}" as course_id,
                   time,
                   # user_id: prefer the event payload's user_id, falling back to the context user_id
                   (case when (event_struct.user_id is not null) then event_struct.user_id
                         when (context.user_id is not null) then context.user_id end) as user_id,
                   # mode: prefer the parsed event struct, falling back to parsing the raw event JSON
                   (case when (event_struct.mode is not null) then event_struct.mode
                         else JSON_EXTRACT_SCALAR(event, "$.mode") end) as mode,
                   # one boolean flag per enrollment event type
                   (case when (event_type = "edx.course.enrollment.activated") then True else False end) as activated,
                   (case when (event_type = "edx.course.enrollment.deactivated") then True else False end) as deactivated,
                   (case when (event_type = "edx.course.enrollment.mode_changed") then True else False end) as mode_changed,
                   (case when (event_type = "edx.course.enrollment.upgrade.succeeded") then True else False end) as upgraded,
                   event_type,
               FROM {DATASETS}
               where (event_type contains "edx.course.enrollment")
                   and time > TIMESTAMP("{last_date}")
               order by time;
               """

    table = 'enrollment_events'

    def gdf(row):
        # Extract the date of a result row; the code treats row['time'] as a
        # unix timestamp in seconds.
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    # days_delta=0: events are appended with no day offset between log date and row date.
    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    Create one enrollday_all table, containing time, user_id, and enrollment stats.

    Just as is done for problem_check tables, the master table holds all the
    events extracted from the tracking logs for a course.  It isn't split into
    separate days.  It is ordered in time, however.  To update it, a new day's
    logs are processed, then the results appended to this table.

    If the enrollday_all table doesn't exist, then run it once on all the
    existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    course_id          : edX course id the table is built for
    force_recompute    : if True, recompute the table even if it already exists
    use_dataset_latest : if True, use the "latest" variant of the course dataset
    end_date           : optional last tracking-log date to process
    '''
    # Legacy-BigQuery SQL; {course_id}, {DATASETS} and {last_date} are filled
    # in by run_query_on_tracking_logs for each incremental run.
    #
    # BUGFIX: the WHERE clause previously read
    #     (activated) or (deactivated) and time > TIMESTAMP("{last_date}")
    # In SQL, AND binds more tightly than OR, so that parsed as
    #     activated or (deactivated and time > ...)
    # meaning "activated" events from ALL dates were re-selected on every
    # incremental append, duplicating rows.  The OR is now parenthesized so the
    # incremental date filter applies to both event types.
    SQL = """
               SELECT "{course_id}" as course_id,
                   time,
                   event_struct.user_id as user_id,
                   # per-mode enrollment deltas: +1 on activation, -1 on deactivation
                   (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "honor") then 1
                         when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "honor") then -1
                         else 0 end) as diff_enrollment_honor,
                   (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "verified") then 1
                         when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "verified") then -1
                         else 0 end) as diff_enrollment_verified,
                   (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "audit") then 1
                         when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "audit") then -1
                         else 0 end) as diff_enrollment_audit,
               FROM {DATASETS}
               where ((event_type = "edx.course.enrollment.activated") or
                      (event_type = "edx.course.enrollment.deactivated"))
                   and time > TIMESTAMP("{last_date}")
               order by time;
               """

    table = 'enrollday_all'

    def gdf(row):
        # Extract the date of a result row; row['time'] is treated as a unix
        # timestamp in seconds.
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
def createVideoStats_day( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create video statistics per day for viewed by looking for users who had a video position > 0,
    and watched by looking for users who had a video position > 95% of the total video length duration.

    course_id          : edX course id the table is built for
    force_recompute    : if True, recompute the table even if it already exists
    use_dataset_latest : if True, use the "latest" variant of the course dataset
    skip_last_day      : if True, do not process the last (possibly partial) day of logs
    end_date           : accepted for interface consistency
                         NOTE(review): end_date is not forwarded to
                         run_query_on_tracking_logs below — confirm whether that is intentional.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')  # NOTE(review): assigned but unused here
    table = TABLE_VIDEO_STATS_PER_DAY

    # Legacy-BigQuery SQL.  One row per (username, video_id, date), with the
    # maximum playback position observed that day (scaled by playback speed
    # when the event carries one).  {DATASETS} is filled in by
    # run_query_on_tracking_logs.
    the_sql = """
              SELECT date(time)as date, username,
                  #module_id as video_id,
                  #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                  (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'([-])' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                  max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
              FROM {DATASETS}
              WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and event is not null
              group by username, video_id, date
              order by date
              """

    # The try/except uses the assert as a branch: when the table already exists
    # we announce an append; when it is missing (or the metadata fetch fails)
    # the except path announces creation.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_STATS_PER_DAY )
        assert tinfo is not None, "[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)
        print "[analyze_videos] Appending latest data to %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)
        sys.stdout.flush()
    except (AssertionError, Exception) as err:   # NOTE(review): Exception already subsumes AssertionError
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s? Attempting to create..." % ( dataset, TABLE_VIDEO_STATS_PER_DAY )
        sys.stdout.flush()
        pass

    print "=== Processing Video Stats Per Day for %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        # Result rows carry 'date' as a 'YYYY-MM-DD' string.
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     skip_last_day=skip_last_day)

    print "Done with Video Stats Per Day for %s (end %s)" % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def compute_person_course_day_ip_table(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    Build the pcday_ip_counts table for the given course_id.

    The master table accumulates (username, course_id, date, ip_addr, nip_count)
    rows for the whole course in a single time-ordered table; each incremental
    run processes a new day of logs and appends the results.  It feeds the
    computation of each user's modal IP address.

    When the table is absent it is computed from all available tracking logs;
    when present, only the not-yet-processed log dates are queried and appended.

    Should the query die with "Resources exceeded during query execution",
    supply end_date to process a portion of the logs at a time.
    '''
    # One output row per (username, ip, date) with the event count for that
    # triple; placeholders are filled in by run_query_on_tracking_logs.
    ip_counts_sql = """
              SELECT username, ip, date(time) as date, count(*) as ipcount,
                  '{course_id}' as course_id,
              FROM {DATASETS}
              where username != ""
              group by username, ip, date
              order by date
              """

    def extract_row_date(row):
        # Result rows carry 'date' as a 'YYYY-MM-DD' string.
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        ip_counts_sql,
        'pcday_ip_counts',
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        end_date=end_date,
        get_date_function=extract_row_date,
    )
def compute_person_course_day_ip_table(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    make pcday_ip_counts table for specified course_id.

    The master table holds all the (username, course_id, date, ip_addr, nip_count)
    data for a course.  It isn't split into separate days.  It is ordered in time,
    however.  To update it, a new day's logs are processed, then the results appended
    to this table.

    This is used in computing the modal IP address of users.

    If the pcday_ip_counts table doesn't exist, then run it once on all the
    existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE(review): this function is defined twice in this file with identical
    bodies; this later definition shadows the earlier one at import time.
    Consider removing one copy.
    '''
    # Legacy-BigQuery SQL: one row per (username, ip, date) with the event count.
    # {course_id} and {DATASETS} are filled in by run_query_on_tracking_logs.
    SQL = """
              SELECT username, ip, date(time) as date, count(*) as ipcount,
                  '{course_id}' as course_id,
              FROM {DATASETS}
              where username != ""
              group by username, ip, date
              order by date
              """

    table = 'pcday_ip_counts'

    def gdf(row):
        # Result rows carry 'date' as a 'YYYY-MM-DD' string.
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf)
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all day's data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log
    data in the processing.  This is done to avoid processing partial data,
    e.g. when tracking log data are incrementally loaded with a delta of less
    than one day.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    # --- Probe for optional side tables.  Each flag selects which SELECT
    # fragment and which extra sub-query are spliced into the final SQL below,
    # so the query degrades gracefully when a side table is absent.
    videoTableExists = False
    try:
        tinfo_video = bqutil.get_bq_table_info(dataset, 'video_stats_day')
        assert tinfo_video is not None, "Video stats table missing... Not including video stats"
        videoTableExists = True
    except (AssertionError, Exception) as err:   # NOTE(review): Exception already subsumes AssertionError
        #print " --> Err: missing %s.%s? Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    forumTableExists = False
    try:
        tinfo_forum = bqutil.get_bq_table_info(dataset, 'forum_events')
        assert tinfo_forum is not None, "Forum events table missing... Not including forum stats"
        forumTableExists = True
    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s? Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    problemTableExists = False
    try:
        # Problem stats require all four of these tables to be present.
        tinfo_personproblem = bqutil.get_bq_table_info(dataset, 'person_problem')
        tinfo_courseproblem = bqutil.get_bq_table_info(dataset, 'course_problem')
        tinfo_courseaxis = bqutil.get_bq_table_info(dataset, 'course_axis')
        tinfo_personcourse = bqutil.get_bq_table_info(dataset, 'person_course')
        # Check course axis and person course, course problem
        assert tinfo_personproblem is not None, "Person problem table missing... Not including problem stats"
        assert tinfo_courseproblem is not None, "Course problem table missing... Not including problem stats"
        assert tinfo_courseaxis is not None, "Course axis table missing... Not including problem stats"
        assert tinfo_personcourse is not None, "Person Course table missing... Not including problem stats"
        problemTableExists = True
    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s? Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass

    # --- SQL fragments (legacy BigQuery dialect).  The final query is the
    # concatenation BASE_SELECT + VIDEO_SELECT + FORUM_SELECT + PROBLEM_SELECT
    # + MID + ADD + END; the *_SELECT fragments are output columns and the ADD
    # fragments are comma-joined extra sub-queries in the FROM clause.
    PCDAY_SQL_BASE_SELECT = """
                  SELECT username,
                      '{course_id}' AS course_id,
                      DATE(time) AS date,
                      SUM(bevent) AS nevents,
                      SUM(bprogress) AS nprogcheck,
                      SUM(bshow_answer) AS nshow_answer,
                      SUM(bvideo) AS nvideo,
                      SUM(bproblem_check) AS nproblem_check,
                      SUM(bforum) AS nforum,
                      SUM(bshow_transcript) AS ntranscript,
                      SUM(bseq_goto) AS nseq_goto,
                      SUM(bseek_video) AS nseek_video,
                      SUM(bpause_video) AS npause_video,
                  """

    PCDAY_SQL_VIDEO_EXISTS = """
                      COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed
                      SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position
                  """

    PCDAY_SQL_VIDEO_DNE = """
                      0 AS nvideos_viewed, # New Video - Unique videos viewed
                      FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position
                  """
    PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE

    PCDAY_SQL_FORUM_EXISTS = """
                      SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads
                      SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts
                      COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with
                  """

    PCDAY_SQL_FORUM_DNE = """
                      0 AS nforum_reads, # New discussion - Forum reads
                      0 AS nforum_posts, # New discussion - Forum posts
                      0 AS nforum_threads, # New discussion - Unique forum threads interacted with
                  """
    PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE

    # NOTE(review): "forumula"/"nproblems_forumula" spellings below match the
    # upstream schema/itype values and are preserved as-is; also the GROUP BY
    # in the problem sub-query lists ncount_problem_choice twice.
    PCDAY_SQL_PROBLEM_EXISTS = """
                      COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted
                      SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts
                      SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice,
                      SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice,
                      SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical,
                      SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option,
                      SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom,
                      SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string,
                      SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed,
                      SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula,
                      SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other,
                  """

    PCDAY_SQL_PROBLEM_DNE = """
                      0 AS nproblems_answered, # New Problem - Unique problems attempted
                      0 AS nproblems_attempted, # New Problem - Total attempts
                      0 AS nproblems_multiplechoice,
                      0 AS nproblems_choice,
                      0 AS nproblems_numerical,
                      0 AS nproblems_option,
                      0 AS nproblems_custom,
                      0 AS nproblems_string,
                      0 AS nproblems_mixed,
                      0 AS nproblems_forumula,
                      0 AS nproblems_other,
                  """
    PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE

    # Session-gap statistics: per-event deltas vs the previous event of the
    # same user (LAG over username), with gaps over 5 minutes treated as NULL.
    PCDAY_SQL_MID = """
                      MAX(time) AS last_event,
                      AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS avg_dt,
                      STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt,
                      MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt,
                      COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt,
                      SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt
                  FROM (
                      SELECT * FROM
                      (
                          SELECT username,
                              CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo,
                              CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check,
                              CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent,
                              CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
                              CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
                              CASE WHEN event_type IN ("show_answer", "showanswer") THEN 1 ELSE 0 END AS bshow_answer,
                              CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript,
                              CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto,
                              CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video,
                              CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video,
                              # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
                              # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
                              time,
                              LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time
                          FROM {DATASETS}
                          WHERE NOT event_type CONTAINS "/xblock/"
                              AND username != ""
                      )
                  """

    # Extra sub-queries unioned into the FROM clause; each is restricted to the
    # [{min_date_start}, {max_date_end}] window being processed.
    PCDAY_SQL_VIDEO = """
                      ,
                      (  # Video events
                          SELECT TIMESTAMP(date) as time,
                              '{course_id}' as course_id,
                              username,
                              video_id,
                              position,
                          FROM [{dataset}.video_stats_day]
                          WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}")
                      )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_VIDEO if videoTableExists else ''

    PCDAY_SQL_FORUM = """
                      ,
                      (  # Forum Events
                          SELECT time,
                              username,
                              '{course_id}' as course_id,
                              thread_id,
                              (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply"
                                          or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment")
                                    THEN 1 ELSE 0 END) AS write,
                              (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read,
                          FROM [{dataset}.forum_events]
                          WHERE (forum_action == "reply" or forum_action == "comment_reply"
                                 or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment"
                                 or forum_action == "read" or forum_action == "read_inline")
                                and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") )
                      )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_FORUM if forumTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_PROBLEM = """,
                      (  # Problems
                          SELECT pc.username AS username,
                              pp.problem_nid AS problem_nid,
                              pp.n_attempts AS n_attempts,
                              pp.time AS time,
                              '{course_id}' as course_id,
                              pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice,
                              pp.ncount_problem_choice as ncount_problem_choice,
                              pp.ncount_problem_numerical as ncount_problem_numerical,
                              pp.ncount_problem_option as ncount_problem_option,
                              pp.ncount_problem_custom as ncount_problem_custom,
                              pp.ncount_problem_string as ncount_problem_string,
                              pp.ncount_problem_mixed as ncount_problem_mixed,
                              pp.ncount_problem_formula as ncount_problem_formula,
                              pp.ncount_problem_other as ncount_problem_other,
                          FROM
                          (
                              (
                                  SELECT PP.user_id as user_id,
                                      PP.problem_nid AS problem_nid,
                                      PP.n_attempts as n_attempts,
                                      PP.date as time,
                                      (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice
                                      (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice, # Choice
                                      (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input
                                      (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option, # Choice
                                      (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom, # Custom
                                      (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string, # Input
                                      (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed, # Mixed
                                      (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula, # Input
                                      (Case when CP_CA.data_itype != "multiplechoiceresponse" and
                                                 CP_CA.data_itype != "choiceresponse" and
                                                 CP_CA.data_itype != "numericalresponse" and
                                                 CP_CA.data_itype != "optionresponse" and
                                                 CP_CA.data_itype != "customresponse" and
                                                 CP_CA.data_itype != "stringresponse" and
                                                 CP_CA.data_itype != "mixed" and
                                                 CP_CA.data_itype != "forumula" then 1 else 0 end) as ncount_problem_other, # Input
                                      #MAX(n_attempts) AS n_attempts,
                                      #MAX(date) AS time,
                                  FROM [{dataset}.person_problem] PP
                                  LEFT JOIN
                                  (
                                      SELECT CP.problem_nid as problem_nid,
                                          INTEGER(CP.problem_id) as problem_id,
                                          CA.data.itype as data_itype,
                                      FROM [{dataset}.course_problem] CP
                                      LEFT JOIN [{dataset}.course_axis] CA
                                      ON CP.problem_id == CA.url_name
                                  ) as CP_CA
                                  ON PP.problem_nid == CP_CA.problem_nid
                                  GROUP BY time, user_id, problem_nid, n_attempts,
                                      ncount_problem_multiplechoice,
                                      ncount_problem_choice,
                                      ncount_problem_choice,
                                      ncount_problem_numerical,
                                      ncount_problem_option,
                                      ncount_problem_custom,
                                      ncount_problem_string,
                                      ncount_problem_mixed,
                                      ncount_problem_formula,
                                      ncount_problem_other
                              )
                              #FROM [{dataset}.person_item] PI
                              #JOIN [{dataset}.course_item] CI
                              #ON PI.item_nid = CI.item_nid
                              #GROUP BY user_id,
                              #problem_nid
                              #ORDER BY
                              #user_id,
                              #problem_nid
                          ) AS pp
                          LEFT JOIN
                          (
                              SELECT username, user_id FROM [{dataset}.person_course]
                          ) AS pc
                          ON pc.user_id = pp.user_id
                          WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}")
                      )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_PROBLEM if problemTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_END = """
                  )
                  WHERE time > TIMESTAMP("{last_date}")
                  GROUP BY course_id, username, date
                  ORDER BY date
                  """

    PCDAY_SQL_NEW = PCDAY_SQL_BASE_SELECT + PCDAY_SQL_VIDEO_SELECT + PCDAY_SQL_FORUM_SELECT + PCDAY_SQL_PROBLEM_SELECT + PCDAY_SQL_MID + PCDAY_SQL_ADD + PCDAY_SQL_END

    # Pre-substitute {dataset} now, but re-emit the placeholders that
    # run_query_on_tracking_logs substitutes later ({course_id}, {DATASETS},
    # {last_date}, {min_date_start}, {max_date_end}).
    PCDAY_SQL = PCDAY_SQL_NEW.format(dataset=dataset,
                                     course_id="{course_id}",
                                     DATASETS="{DATASETS}",
                                     last_date="{last_date}",
                                     min_date_start="{min_date_start}",
                                     max_date_end="{max_date_end}")

    table = 'person_course_day'

    def gdf(row):
        # Result rows carry 'date' as a 'YYYY-MM-DD' string.
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print "=== Processing person_course_day for %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it
    # has new schema, else force recompute.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception as err:
        tinfo = None

    if tinfo:
        fields = tinfo['schema']['fields']
        field_names = [x['name'] for x in fields]
        if not 'nvideos_viewed' in field_names:
            cdt = tinfo['creationTime']
            print " --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt
            sys.stdout.flush()
            force_recompute = True

    # newer_than matches the 19-Jan-2016 schema revision noted above.
    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     newer_than=datetime.datetime( 2016, 1, 19, 22, 30 ),
                                                     skip_last_day=skip_last_day)

    print "Done with person_course_day for %s (end %s)" % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def createVideoStats_day(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create video statistics per day for viewed by looking for users who had a video position > 0,
    and watched by looking for users who had a video position > 95% of the total video length duration.

    NOTE(review): this function is defined twice in this file with equivalent
    bodies; this later definition shadows the earlier one at import time.
    Consider removing one copy.

    NOTE(review): end_date is accepted but not forwarded to
    run_query_on_tracking_logs below — confirm whether that is intentional.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')  # NOTE(review): assigned but unused here
    table = TABLE_VIDEO_STATS_PER_DAY

    # Legacy-BigQuery SQL.  One row per (username, video_id, date), with the
    # maximum playback position observed that day (scaled by playback speed
    # when present).  {DATASETS} is filled in by run_query_on_tracking_logs.
    the_sql = """
              SELECT date(time)as date, username,
                  #module_id as video_id,
                  #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                  (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'([-])' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                  max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
              FROM {DATASETS}
              WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and event is not null
              group by username, video_id, date
              order by date
              """

    # The assert doubles as a branch: table present -> announce append; table
    # missing (or metadata fetch failed) -> except path announces creation.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_STATS_PER_DAY)
        assert tinfo is not None, "[analyze_videos] Creating %s.%s table for %s" % (
            dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)
        print "[analyze_videos] Appending latest data to %s.%s table for %s" % (
            dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)
        sys.stdout.flush()
    except (AssertionError, Exception) as err:   # NOTE(review): Exception already subsumes AssertionError
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s? Attempting to create..." % ( dataset, TABLE_VIDEO_STATS_PER_DAY)
        sys.stdout.flush()
        pass

    print "=== Processing Video Stats Per Day for %s (start %s)" % ( course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        # Result rows carry 'date' as a 'YYYY-MM-DD' string.
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        the_sql, table, course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        get_date_function=gdf,
        skip_last_day=skip_last_day)

    print "Done with Video Stats Per Day for %s (end %s)" % ( course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def get_ora_events(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    make ora_events table for specified course_id.

    The master table holds all the openassessment events extracted from the
    tracking logs for a course.  It isn't split into separate days.  It is
    ordered in time, however.  To update it, a new day's logs are processed,
    then the results appended to this table.

    If the ora_events table doesn't exist, then run it once on all the
    existing tracking logs.

    course_id          : edX course id the table is built for
    force_recompute    : if True, recompute the table even if it already exists
    use_dataset_latest : if True, use the "latest" variant of the course dataset
    end_date           : optional last tracking-log date to process
    '''
    # Legacy-BigQuery SQL, three layers deep:
    #   innermost: parse per-event fields (action, submission_uuid, scores, ...)
    #   middle:    window over submission_uuid to recover module_id for events
    #              that do not carry one themselves (e.g. peer_assess)
    #   outermost: derive problem_id / module_id from whichever source is set.
    # {course_id}, {DATASETS} and {last_date} are filled in by
    # run_query_on_tracking_logs.
    SQL = """
              SELECT
                  (case when module_id_sometimes is not null then REGEXP_EXTRACT(module_id_sometimes, ".*/([^/]+)")
                        when submission_uuid is not null then REGEXP_EXTRACT(last_module_id, ".*/([^/]+)")
                   end) as problem_id,
                  *,
                  (case when module_id_sometimes is not null then module_id_sometimes
                        when submission_uuid is not null then last_module_id
                   end) as module_id,
              FROM
              (SELECT
                  *,
                  # must aggregate to get module_id, which is not available for some events, like peer_assess
                  # this means module_id may be missing, e.g. if the creation event happens in a different day's tracking log
                  NTH_VALUE(module_id_sometimes,1) over (partition by submission_uuid order by module_id_sometimes desc) last_module_id,
              from
              (SELECT time,
                  username,
                  '{course_id}' as course_id,
                  (case when REGEXP_MATCH(event_type, "openassessmentblock..*") then REGEXP_EXTRACT(event_type, "openassessmentblock.(.*)")
                        when REGEXP_MATCH(event_type, "/courses/.*") then REGEXP_EXTRACT(event_type, "/courses/.*/(handler/.*)")
                   end) as action,
                  (case when event_type = "openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                        when event_type = "openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                        when event_type = "openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                        when event_type = "openassessmentblock.get_peer_submission" then JSON_EXTRACT_SCALAR(event, "$.submission_returned_uuid")
                   end) as submission_uuid,
                  (case when event_type="openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].option.points")
                        when event_type="openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].option.points")
                   end) as part1_points,
                  (case when event_type="openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].feedback")
                        when event_type="openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].feedback")
                   end) as part1_feedback,
                  (case when event_type="openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.attempt_number")
                   end) as attempt_num,
                  (case when event_type="openassessmentblock.get_peer_submission" then JSON_EXTRACT_SCALAR(event, "$.item_id")
                        when event_type="openassessmentblock.create_submission" then REGEXP_EXTRACT( JSON_EXTRACT_SCALAR(event, "$.answer.file_key"), ".*(i4x://.*)")
                        when REGEXP_MATCH(event_type, "/courses/.*/xblock/i4x:.*") then REGEXP_REPLACE( REGEXP_EXTRACT( event_type, "/courses/.*/xblock/(i4x:;_[^/]+)/handler.*" ), ";_", "/")
                   end) as module_id_sometimes,
                  event_type,
                  event,
                  event_source,
                  (case when event_type="openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.answer.text")
                   end) as answer_text,
              from {DATASETS}
              where event_type contains "openassessment"
                  and event != '{{"POST": {{}}, "GET": {{}}}}'   # disregard if user did nothing but look at the page
                  and time > TIMESTAMP("{last_date}")
              order by time
              )
              order by time
              )
              order by time
              """

    table = 'ora_events'

    def gdf(row):
        # Extract the date of a result row; row['time'] is treated as a unix
        # timestamp in seconds.
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
def process_course_time_on_asset(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                 just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                 skip_totals=False, start_date=None, config_parameter_overrides=None):
    '''
    Create the time_on_asset_daily table, containing module_id, username, date,
    and time on asset stats.  This table has separate rows for each date, and
    is ordered in time.  To update it, a new day's logs are processed, then the
    results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    Compute totals and store in time_on_asset_totals, by summing over all dates,
    grouped by module_id.

    How are time_on_asset numbers computed?  See discussion in make_time_on_task.py

    The time_one_asset_daily table has these columns:

    - date: gives day for the data
    - username
    - module_id
    - time_umid5: total time on module (by module_id) in seconds, with a 5-minute timeout
    - time_umid30: total time on module (by module_id) in seconds, with a 30-minute timeout

    config_parameter_overrides is accepted but not used within this function.
    '''
    if just_do_totals:
        # NOTE(review): this early path calls process_time_on_task_totals while
        # the normal path at the bottom calls process_time_on_asset_totals —
        # confirm both helpers exist and the asymmetry is intentional.
        return process_time_on_task_totals(course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest)

    # Legacy-BigQuery SQL.  Nested selects compute, for each event, the gap
    # since the user's previous event on the same module (via LAG windows),
    # then aggregate those gaps per (date, module_id, username) with 5-minute
    # and 30-minute timeout cutoffs.  {course_id}, {DATASETS} and {last_date}
    # are filled in by run_query_on_tracking_logs.
    SQL = """
            SELECT
                "{course_id}" as course_id,
                date(time) as date,
                username,
                module_id,
                # time_umid5 = total time on module (by module_id) in seconds
                # time_mid5 has 5 minute timeout, time_mid30 has 30 min timeout
                SUM( case when dt_umid < 5*60 then dt_umid end ) as time_umid5,
                SUM( case when dt_umid < 30*60 then dt_umid end ) as time_umid30,
            FROM (
                SELECT time,
                    username,
                    module_id,
                    (time - last_time)/1.0E6 as dt,             # dt is in seconds
                    (time - last_time_umid)/1.0E6 as dt_umid,   # dt for (user, module_id) in seconds
                    last_time_umid,
                FROM (
                    SELECT time,
                        username,
                        last_username,
                        module_id,
                        USEC_TO_TIMESTAMP(last_time) as last_time,
                        USEC_TO_TIMESTAMP(last_time_umid) as last_time_umid,
                    FROM (
                        SELECT time,
                            username,
                            module_id,
                            lag(time, 1) over (partition by username order by time) last_time,
                            lag(username, 1) over (partition by username order by time) last_username,
                            lag(time, 1) over (partition by username, module_id order by time) last_time_umid,
                        FROM
                            (SELECT time,
                                username,
                                (case when REGEXP_MATCH(module_id, r'.*\"\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\}}$')
                                      when REGEXP_MATCH(module_id, r'.*\"\]\}}\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\]\}}\}}$')
                                      else module_id end) as module_id,  # fix some errors in module_id names
                            FROM {DATASETS}
                            )
                        WHERE module_id is not null
                              AND username is not null
                              AND username != ""
                              and time > TIMESTAMP("{last_date}")
                    )
                )
            )
            WHERE module_id is not null
                  AND NOT module_id CONTAINS '"'
            GROUP BY date, module_id, username
            ORDER BY date, module_id, username
          """

    table = 'time_on_asset_daily'

    def gdf(row):
        # Result rows carry 'date' as a 'YYYY-MM-DD' string.
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015,3,15),  # schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_asset_totals(course_id, force_recompute=force_recompute,
                                            use_dataset_latest=use_dataset_latest)

    return
def CreateForumEvents(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related
    events, including forum post reads, into the date-time ordered table.
    Repeated calls to this procedure will append new events to the table.  If no
    new events are found, the existing table is left unchanged.

    skip_last_day: if True, exclude the last day of tracking-log data (to avoid
                   ingesting a partially-loaded day).
    end_date:      optional upper bound on the dates processed.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    # NOTE(review): 'logs' is computed but never referenced below (the SQL uses
    # the {DATASETS} placeholder instead) — confirm whether this is dead code.
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    #
    #  "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    # The SQL below classifies each raw event_type URL into a short forum_action
    # label, and extracts thread_id / subject / target_user_id from the URL
    # (or from module_id for old-style transparent course ids).
    the_sql = """
              SELECT time,
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                     (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)')   # For old-school courses with transparent course ids
                           else (case when module_id is null   # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                 then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)')  # read
                                       else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)')  # upvote, pinned, upvoted, unvoted, deleted, followed
                                             else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end)  # comment
                                       end)
                                 end)
                           end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\\.forum\\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     )
                     AND username is not null
                     AND event is not null
                     and time > TIMESTAMP("{last_date}")
                     {hash_limit}
              order by time
              """

    try:
        # If the table already exists we append; an AssertionError is raised (and
        # caught below) when the table is missing, so the run falls through to
        # table creation by run_query_on_tracking_logs.
        tinfo = bqutil.get_bq_table_info(dataset, table)
        assert tinfo is not None, "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, table, course_id)

        print "[make_forum_analysis] Appending latest data to %s.%s table for %s" % (dataset, table, course_id)
        sys.stdout.flush()

    # NOTE(review): AssertionError is already a subclass of Exception, so this
    # tuple is redundant; kept as-is here (documentation-only change).
    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % (dataset, table)
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        # tracking-log rows carry epoch-seconds timestamps
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day)

    print "Done with Forum Events for %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None): dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_PROBLEM_EVENTS the_sql = """ SELECT context.user_id as user_id, time, event_source, REGEXP_EXTRACT( (CASE when module_id is not null then module_id when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/") else REPLACE(event_struct.problem, "i4x://", "") end), "[^/]+/problem/([^/]+)") as problem_url, (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)") when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)") else event_type end) as event_type, event_struct.attempts as attempts, event_struct.success as success, event_struct.grade as grade, FROM {DATASETS} WHERE ( REGEXP_MATCH(event_type, r'problem_\w+') OR event_type = "showanswer" ) AND context.user_id is not null and time > TIMESTAMP("{last_date}") {hash_limit} order by user_id, time """ try: bqutil.create_dataset_if_nonexistent(dataset) tinfo = bqutil.get_bq_table_info(dataset, table ) assert tinfo is not None, "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id) print "[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id) sys.stdout.flush() except (AssertionError, Exception) as err: print str(err) sys.stdout.flush() print " --> Missing %s.%s? Attempting to create..." 
% ( dataset, table ) sys.stdout.flush() pass print "=== Processing Forum Events for %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() def gdf(row): return datetime.datetime.utcfromtimestamp(float(row['time'])) process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, get_date_function=gdf, has_hash_limit=True, end_date=end_date, skip_last_day=skip_last_day ) print "Done with Problem Events for %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None): dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_PROBLEM_EVENTS the_sql = """ SELECT context.user_id as user_id, time, event_source, REGEXP_EXTRACT( (CASE when module_id is not null then module_id when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/") else REPLACE(event_struct.problem, "i4x://", "") end), "[^/]+/problem/([^/]+)") as problem_url, (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)") when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)") else event_type end) as event_type, event_struct.attempts as attempts, event_struct.success as success, event_struct.grade as grade, FROM {DATASETS} WHERE ( REGEXP_MATCH(event_type, r'problem_\w+') OR event_type = "showanswer" ) AND context.user_id is not null and time > TIMESTAMP("{last_date}") {hash_limit} order by user_id, time """ try: tinfo = bqutil.get_bq_table_info(dataset, table ) assert tinfo is not None, "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id) print "[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id) sys.stdout.flush() except (AssertionError, Exception) as err: print str(err) sys.stdout.flush() print " --> Missing %s.%s? Attempting to create..." 
% ( dataset, table ) sys.stdout.flush() pass print "=== Processing Forum Events for %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() def gdf(row): return datetime.datetime.utcfromtimestamp(float(row['time'])) process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, get_date_function=gdf, has_hash_limit=True, end_date=end_date, skip_last_day=skip_last_day ) print "Done with Problem Events for %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
def process_course_time_on_task(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                skip_totals=False, start_date=None):
    '''
    Create the time_on_task table, containing time, user_id, and time on task stats.
    This table isn't split into separate days.  It is ordered in time, however.  To update it,
    a new day's logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already been done.
    Then do all tracking logs except those which have already been done.  Append the results
    to the existing table.

    Compute totals and store in time_on_task_totals, by summing over all dates, grouped by user.

    just_do_totals:   skip the daily computation and only (re)compute the totals table.
    limit_query_size: passed through to run_query_on_tracking_logs to bound query size.
    skip_totals:      if True, do not compute the totals table after the daily update.
    '''

    if just_do_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    # Per-day, per-user time sums.  Event gaps (dt) are computed with lag() window
    # functions, partitioned per user and per event category (video / problem /
    # forum / text); gaps above the 5-min or 30-min timeout are dropped from the sums.
    SQL = """
            SELECT
                "{course_id}" as course_id,
                date(time) as date,
                username,

                # total time spent on system
                SUM( case when dt < 5*60 then dt end ) as total_time_5,
                SUM( case when dt < 30*60 then dt end ) as total_time_30,

                # total time spent watching videos
                SUM( case when (dt_video is not null) and (dt_video < 5*60) then dt_video end ) as total_video_time_5,
                SUM( case when (dt_video is not null) and (dt_video < 30*60) then dt_video end ) as total_video_time_30,
                SUM( case when (serial_dt_video is not null) and (serial_dt_video < 30*60) then serial_dt_video end ) as serial_video_time_30,

                # total time spent doing problems
                SUM( case when (dt_problem is not null) and (dt_problem < 5*60) then dt_problem end ) as total_problem_time_5,
                SUM( case when (dt_problem is not null) and (dt_problem < 30*60) then dt_problem end ) as total_problem_time_30,
                SUM( case when (serial_dt_problem is not null) and (serial_dt_problem < 30*60) then serial_dt_problem end ) as serial_problem_time_30,

                # total time spent on forum
                SUM( case when (dt_forum is not null) and (dt_forum < 5*60) then dt_forum end ) as total_forum_time_5,
                SUM( case when (dt_forum is not null) and (dt_forum < 30*60) then dt_forum end ) as total_forum_time_30,
                SUM( case when (serial_dt_forum is not null) and (serial_dt_forum < 30*60) then serial_dt_forum end ) as serial_forum_time_30,

                # total time spent with textbook or wiki
                SUM( case when (dt_text is not null) and (dt_text < 5*60) then dt_text end ) as total_text_time_5,
                SUM( case when (dt_text is not null) and (dt_text < 30*60) then dt_text end ) as total_text_time_30,
                SUM( case when (serial_dt_text is not null) and (serial_dt_text < 30*60) then serial_dt_text end ) as serial_text_time_30,
            FROM (
              SELECT time, username,
                  (time - last_time)/1.0E6 as dt,     # dt is in seconds
                  case when is_video then (time - last_time_video)/1.0E6 end as dt_video,
                  case when is_problem then (time - last_time_problem)/1.0E6 end as dt_problem,
                  case when is_forum then (time - last_time_forum)/1.0E6 end as dt_forum,
                  case when is_text then (time - last_time_text)/1.0E6 end as dt_text,
                  case when is_video then (time - last_time_xevent)/1.0E6 end as serial_dt_video,
                  case when is_problem then (time - last_time_xevent)/1.0E6 end as serial_dt_problem,
                  case when is_forum then (time - last_time_xevent)/1.0E6 end as serial_dt_forum,
                  case when is_text then (time - last_time_xevent)/1.0E6 end as serial_dt_text,
              FROM (
                SELECT time, username, last_username,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    (case when is_video then USEC_TO_TIMESTAMP(last_time_video) end) as last_time_video,
                    # last_username_video,
                    # last_event_video,
                    is_problem,
                    is_video,
                    (case when is_problem then USEC_TO_TIMESTAMP(last_time_problem) end) as last_time_problem,
                    # last_username_problem,
                    # last_event_problem,
                    is_forum,
                    is_text,
                    (case when is_forum then USEC_TO_TIMESTAMP(last_time_forum) end) as last_time_forum,
                    (case when is_text then USEC_TO_TIMESTAMP(last_time_text) end) as last_time_text,
                    is_xevent,
                    (case when is_xevent then USEC_TO_TIMESTAMP(last_time_xevent) end) as last_time_xevent,
                FROM
                  (SELECT time, username,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    is_video,
                    is_problem,
                    is_forum,
                    is_text,
                    (is_video or is_problem or is_forum or is_text) as is_xevent,  # x = video, problem, forum, or text: any event
                    case when is_problem then username else '' end as uname_problem,
                    case when is_video then username else '' end as uname_video,
                    case when is_forum then username else '' end as uname_forum,
                    case when is_text then username else '' end as uname_text,
                    case when (is_video or is_problem or is_forum or is_text) then username else '' end as uname_xevent,
                    lag(time, 1) over (partition by uname_video order by time) last_time_video,
                    # lag(event_type, 1) over (partition by uname_video order by time) last_event_video,
                    # lag(uname_video, 1) over (partition by uname_video order by time) last_username_video,
                    lag(time, 1) over (partition by uname_problem order by time) last_time_problem,
                    # lag(event_type, 1) over (partition by uname_problem order by time) last_event_problem,
                    # lag(uname_problem, 1) over (partition by uname_problem order by time) last_username_problem,
                    lag(time, 1) over (partition by uname_forum order by time) last_time_forum,
                    lag(time, 1) over (partition by uname_text order by time) last_time_text,
                    lag(time, 1) over (partition by uname_xevent order by time) last_time_xevent,
                  FROM
                    (SELECT time,
                      username,
                      event_type,
                      case when (REGEXP_MATCH(event_type ,r'\\w+_video') or REGEXP_MATCH(event_type, r'\\w+_transcript') ) then True else False end as is_video,
                      case when REGEXP_MATCH(event_type, r'problem_\\w+') then True else False end as is_problem,
                      case when (REGEXP_MATCH(event_type ,r'^edx\\.forum\\..*') or event_type contains "/discussion/forum" ) then True else False end as is_forum,
                      case when (REGEXP_MATCH(event_type ,r'^textbook\\..*') or event_type contains "/wiki/" ) then True else False end as is_text,
                    FROM {DATASETS}
                    WHERE
                      NOT event_type contains "/xblock/"
                      AND username is not null
                      AND username != ""
                      and time > TIMESTAMP("{last_date}")
                      {hash_limit}
                    )
                  )
                WHERE last_time is not null
                ORDER BY username, time
              )
            )
            group by course_id, username, date
            order by date, username;
          """

    table = 'time_on_task'

    def gdf(row):
        # rows are keyed by a YYYY-MM-DD date string; used for incremental-update bookkeeping
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015,3,15),  # schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
    return
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    Create one enrollday_all table, containing time, user_id, and enrollment stats.

    Just as is done for problem_check tables, the master table holds all the events
    extracted from the tracking logs for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the enrollday_all table doesn't exist, then run it once on all the existing
    tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.
    '''

    # BUGFIX: the WHERE clause previously read
    #     (A) or (B) or (C) and time > TIMESTAMP(...)
    # In SQL, AND binds tighter than OR, so the incremental time filter only
    # applied to the mode_changed branch, re-scanning all activated/deactivated
    # events on every incremental run.  The OR group is now parenthesized so the
    # time filter applies to all three event types.
    SQL = """
            SELECT
                "{course_id}" as course_id,
                time,
                event_struct.user_id as user_id,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "honor")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "honor")
                      then -1
                      else 0 end) as diff_enrollment_honor,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "verified")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "verified")
                      then -1
                      when (event_type = "edx.course.enrollment.mode_changed"
                            and event_struct.mode = "verified")
                      then 1
                      when (event_type = "edx.course.enrollment.mode_changed"
                            and event_struct.mode = "honor")
                      then -1
                      else 0 end) as diff_enrollment_verified,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "audit")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "audit")
                      then -1
                      else 0 end) as diff_enrollment_audit,
            FROM {DATASETS}
            where ((event_type = "edx.course.enrollment.activated") or
                   (event_type = "edx.course.enrollment.deactivated") or
                   (event_type = "edx.course.enrollment.mode_changed"))
                  and time > TIMESTAMP("{last_date}")
            order by time;
            """

    table = 'enrollday_all'

    def gdf(row):
        # tracking-log rows carry epoch-seconds timestamps
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all day's data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.
    '''

    # Per-user, per-day event counts (videos played, problems checked, forum hits,
    # etc.) plus inter-event gap statistics (avg/stddev/max/count/sum of dt), where
    # gaps longer than 5 minutes are treated as session breaks (null).
    PCDAY_SQL = """
    select username,
           "{course_id}" as course_id,
           date(time) as date,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo,
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as sum_dt
    from
      (SELECT username,
         case when event_type = "play_video" then 1 else 0 end as bvideo,
         case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
         case when username != "" then 1 else 0 end as bevent,
         case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
         case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
         case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
         case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
         case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
         case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
         case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
         # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
         # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
         time,
         lag(time, 1) over (partition by username order by time) last_time
       FROM {DATASETS}
       WHERE
         NOT event_type contains "/xblock/"
         AND username != ""
      )
    group by course_id, username, date
    order by date
    """

    table = 'person_course_day'

    def gdf(row):
        # rows are keyed by a YYYY-MM-DD date string; used for incremental-update bookkeeping
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print "=== Processing person_course_day for %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     skip_last_day=skip_last_day)

    print "Done with person_course_day for %s (end %s)" % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def process_course_time_on_task(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                skip_totals=False, start_date=None, config_parameter_overrides=None):
    '''Create the time_on_task table, containing time, user_id, and time on task stats.

    This table isn't split into separate days.  It is ordered in time, however.  To update it,
    a new day's logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already been done.
    Then do all tracking logs except those which have already been done.  Append the results
    to the existing table.

    Compute totals and store in time_on_task_totals, by summing over all dates, grouped by user.

    config_parameter_overrides may be provided as a dict, which overrides the default timeout
    parameters (5 min and 30 min) and output table name; used for bounday analysis testing.

    How are time_on_task numbers computed?

    Let TO be the timeout (eg 5 min or 30 min).  For all time-ordered pairs of events, at times
    t1 and t2, with dt = t2-t1, the total time is the sum of all dt such that dt <= TO.  Solo
    (unpaired) events do not cause addition of any time to the total.  For video (or text or
    problems), events are filtered and only include events of those types.

    This methodology undercounts, for example, if the learner ends watching a video by closing
    the browser, or by switching elsewhere without stopping the video player first.  (note there
    does exist a stop_video event, which occurs when the video stop button is clicked).

    This methodology overcounts, for example, if the learner watches a video for awhile,
    switches to a problem, then back to the video.  If the initial and final video intervals
    happen within TO, then the intermediate interval is counted towards the video total time,
    despite the learner actually doing something else (the problem) during that time.
    '''
    if just_do_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest,
                                           config_parameter_overrides=config_parameter_overrides)

    # Default query parameters; any of these (timeouts, output table name) may be
    # overridden via config_parameter_overrides.
    config_parameters = {'timeout_short': 5,
                         'timeout_long': 30,
                         'time_on_task_table_name': 'time_on_task',
                         'course_id': course_id,
    }

    config_parameters.update(config_parameter_overrides or {})

    # SQL_TOP is formatted now (course_id + timeout parameters); SQL_BOT is left
    # unformatted here because its {DATASETS}, {last_date} and {hash_limit}
    # placeholders are filled in later by run_query_on_tracking_logs.
    SQL_TOP = """
            SELECT
                "{course_id}" as course_id,
                date(time) as date,
                username,

                # total time spent on system
                SUM( case when dt < {timeout_short}*60 then dt end ) as total_time_{timeout_short},
                SUM( case when dt < {timeout_long}*60 then dt end ) as total_time_{timeout_long},

                # total time spent watching videos
                SUM( case when (dt_video is not null) and (dt_video < {timeout_short}*60) then dt_video end ) as total_video_time_{timeout_short},
                SUM( case when (dt_video is not null) and (dt_video < {timeout_long}*60) then dt_video end ) as total_video_time_{timeout_long},
                SUM( case when (serial_dt_video is not null) and (serial_dt_video < {timeout_long}*60) then serial_dt_video end ) as serial_video_time_{timeout_long},

                # total time spent doing problems
                SUM( case when (dt_problem is not null) and (dt_problem < {timeout_short}*60) then dt_problem end ) as total_problem_time_{timeout_short},
                SUM( case when (dt_problem is not null) and (dt_problem < {timeout_long}*60) then dt_problem end ) as total_problem_time_{timeout_long},
                SUM( case when (serial_dt_problem is not null) and (serial_dt_problem < {timeout_long}*60) then serial_dt_problem end ) as serial_problem_time_{timeout_long},

                # total time spent on forum
                SUM( case when (dt_forum is not null) and (dt_forum < {timeout_short}*60) then dt_forum end ) as total_forum_time_{timeout_short},
                SUM( case when (dt_forum is not null) and (dt_forum < {timeout_long}*60) then dt_forum end ) as total_forum_time_{timeout_long},
                SUM( case when (serial_dt_forum is not null) and (serial_dt_forum < {timeout_long}*60) then serial_dt_forum end ) as serial_forum_time_{timeout_long},

                # total time spent with textbook or wiki
                SUM( case when (dt_text is not null) and (dt_text < {timeout_short}*60) then dt_text end ) as total_text_time_{timeout_short},
                SUM( case when (dt_text is not null) and (dt_text < {timeout_long}*60) then dt_text end ) as total_text_time_{timeout_long},
                SUM( case when (serial_dt_text is not null) and (serial_dt_text < {timeout_long}*60) then serial_dt_text end ) as serial_text_time_{timeout_long},
          """

    SQL_BOT = """
            FROM (
              SELECT time, username,
                  (time - last_time)/1.0E6 as dt,     # dt is in seconds
                  case when is_video then (time - last_time_video)/1.0E6 end as dt_video,
                  case when is_problem then (time - last_time_problem)/1.0E6 end as dt_problem,
                  case when is_forum then (time - last_time_forum)/1.0E6 end as dt_forum,
                  case when is_text then (time - last_time_text)/1.0E6 end as dt_text,
                  case when is_video then (time - last_time_xevent)/1.0E6 end as serial_dt_video,
                  case when is_problem then (time - last_time_xevent)/1.0E6 end as serial_dt_problem,
                  case when is_forum then (time - last_time_xevent)/1.0E6 end as serial_dt_forum,
                  case when is_text then (time - last_time_xevent)/1.0E6 end as serial_dt_text,
              FROM (
                SELECT time, username, last_username,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    (case when is_video then USEC_TO_TIMESTAMP(last_time_video) end) as last_time_video,
                    # last_username_video,
                    # last_event_video,
                    is_problem,
                    is_video,
                    (case when is_problem then USEC_TO_TIMESTAMP(last_time_problem) end) as last_time_problem,
                    # last_username_problem,
                    # last_event_problem,
                    is_forum,
                    is_text,
                    (case when is_forum then USEC_TO_TIMESTAMP(last_time_forum) end) as last_time_forum,
                    (case when is_text then USEC_TO_TIMESTAMP(last_time_text) end) as last_time_text,
                    is_xevent,
                    (case when is_xevent then USEC_TO_TIMESTAMP(last_time_xevent) end) as last_time_xevent,
                FROM
                  (SELECT time, username,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    is_video,
                    is_problem,
                    is_forum,
                    is_text,
                    (is_video or is_problem or is_forum or is_text) as is_xevent,  # x = video, problem, forum, or text: any event
                    case when is_problem then username else '' end as uname_problem,
                    case when is_video then username else '' end as uname_video,
                    case when is_forum then username else '' end as uname_forum,
                    case when is_text then username else '' end as uname_text,
                    case when (is_video or is_problem or is_forum or is_text) then username else '' end as uname_xevent,
                    lag(time, 1) over (partition by uname_video order by time) last_time_video,
                    # lag(event_type, 1) over (partition by uname_video order by time) last_event_video,
                    # lag(uname_video, 1) over (partition by uname_video order by time) last_username_video,
                    lag(time, 1) over (partition by uname_problem order by time) last_time_problem,
                    # lag(event_type, 1) over (partition by uname_problem order by time) last_event_problem,
                    # lag(uname_problem, 1) over (partition by uname_problem order by time) last_username_problem,
                    lag(time, 1) over (partition by uname_forum order by time) last_time_forum,
                    lag(time, 1) over (partition by uname_text order by time) last_time_text,
                    lag(time, 1) over (partition by uname_xevent order by time) last_time_xevent,
                  FROM
                    (SELECT time,
                      username,
                      event_type,
                      case when (REGEXP_MATCH(event_type ,r'\\w+_video') or REGEXP_MATCH(event_type, r'\\w+_transcript') ) then True else False end as is_video,
                      case when REGEXP_MATCH(event_type, r'problem_\\w+') then True else False end as is_problem,
                      case when (REGEXP_MATCH(event_type ,r'^edx\\.forum\\..*') or event_type contains "/discussion/forum" or event_type contains "/discussion/threads" or event_type contains "/discussion/comments" ) then True else False end as is_forum,
                      case when (REGEXP_MATCH(event_type ,r'^textbook\\..*') or event_type contains "/wiki/" ) then True else False end as is_text,
                    FROM {DATASETS}
                    WHERE
                      NOT event_type contains "/xblock/"
                      AND username is not null
                      AND username != ""
                      and time > TIMESTAMP("{last_date}")
                      {hash_limit}
                    )
                  )
                WHERE last_time is not null
                ORDER BY username, time
              )
            )
            group by course_id, username, date
            order by date, username;
          """

    SQL = SQL_TOP.format(**config_parameters) + SQL_BOT

    table = config_parameters.get('time_on_task_table_name') or 'time_on_task'

    def gdf(row):
        # rows are keyed by a YYYY-MM-DD date string; used for incremental-update bookkeeping
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015, 3, 15),  # schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest,
                                           config_parameter_overrides=config_parameter_overrides)
    return
def process_course_time_on_asset(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                 just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                 skip_totals=False, start_date=None, config_parameter_overrides=None):
    '''
    Create the time_on_asset_daily table, containing module_id, username, date, and time on asset stats.
    This table has separate rows for each date, and is ordered in time.  To update it, a new day's
    logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already been done.  Then do
    all tracking logs except those which have already been done.  Append the results to the
    existing table.

    Compute totals and store in time_on_asset_totals, by summing over all dates, grouped by module_id.

    How are time_on_asset numbers computed?  See discussion in make_time_on_task.py

    The time_on_asset_daily table has these columns:

    - date: gives day for the data
    - username
    - module_id
    - time_umid5: total time on module (by module_id) in seconds, with a 5-minute timeout
    - time_umid30: total time on module (by module_id) in seconds, with a 30-minute timeout

    NOTE: config_parameter_overrides is currently accepted but unused; kept for
    signature compatibility with process_course_time_on_task.
    '''

    if just_do_totals:
        # Bugfix: this previously delegated to process_time_on_task_totals, which aggregates
        # the unrelated time_on_task table.  Totals for this table belong in
        # time_on_asset_totals, matching both the docstring and the skip_totals branch below.
        return process_time_on_asset_totals(course_id, force_recompute=force_recompute,
                                            use_dataset_latest=use_dataset_latest)

    # BigQuery legacy SQL: compute per-(user, module_id, day) dwell times from successive
    # event timestamps, using LAG windows partitioned by user and by (user, module_id).
    SQL = """
            SELECT
                    "{course_id}" as course_id,
                    date(time) as date,
                    username,
                    module_id,
                    # time_umid5 = total time on module (by module_id) in seconds
                    # time_mid5 has 5 minute timeout, time_mid30 has 30 min timeout
                    SUM( case when dt_umid < 5*60 then dt_umid end ) as time_umid5,
                    SUM( case when dt_umid < 30*60 then dt_umid end ) as time_umid30,
            FROM (
              SELECT time,
                username,
                module_id,
                (time - last_time)/1.0E6 as dt,           # dt is in seconds
                (time - last_time_umid)/1.0E6 as dt_umid, # dt for (user, module_id) in seconds
                last_time_umid,
              FROM (
                SELECT time,
                    username,
                    last_username,
                    module_id,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    USEC_TO_TIMESTAMP(last_time_umid) as last_time_umid,
                FROM (
                  SELECT time,
                      username,
                      module_id,
                      lag(time, 1) over (partition by username order by time) last_time,
                      lag(username, 1) over (partition by username order by time) last_username,
                      lag(time, 1) over (partition by username, module_id order by time) last_time_umid,
                  FROM
                      (SELECT time,
                         username,
                         (case when REGEXP_MATCH(module_id, r'.*\"\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\}}$')
                             when REGEXP_MATCH(module_id, r'.*\"\]\}}\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\]\}}\}}$')
                             else module_id end) as module_id,   # fix some errors in module_id names
                       FROM {DATASETS}
                      )
                  WHERE module_id is not null
                        AND username is not null
                        AND username != ""
                        and time > TIMESTAMP("{last_date}")
                  )
                )
              )
            WHERE module_id is not null
                  AND NOT module_id CONTAINS '"'
            GROUP BY date, module_id, username
            ORDER BY date, module_id, username
          """

    table = 'time_on_asset_daily'

    def gdf(row):
        # get_date_function for the incremental-update machinery: rows are keyed by 'date'
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015, 3, 15),  # schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_asset_totals(course_id, force_recompute=force_recompute,
                                            use_dataset_latest=use_dataset_latest)

    return
def compute_person_course_day_trlang_table(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    make pcday_trlang_counts for specified course_id

    This course table holds all the (username, course_id, date, transcript_language, count)
    data for a course.  This is used in computing the modal video transcript language of users.

    Rows are produced by classifying tracking-log events into transcript downloads
    ('edx.ui.lms.link_clicked' on a .../handler/transcript/download URL) and transcript
    language toggles (.../transcript/translation/<lang> URLs), then counting per user per day.
    '''
    table = 'pcday_trlang_counts'

    # BigQuery legacy SQL.
    # NOTE(review): in the outermost CASE, the 'transcript_language' WHEN branch and the
    # ELSE branch both return resource_event_data, so that WHEN is redundant (harmless).
    # NOTE(review): the final GROUP BY list ends with a trailing comma before ORDER BY —
    # tolerated by BigQuery legacy SQL, but worth confirming if this query is ever migrated.
    SQL = '''
            SELECT
                username,
                course_id,
                DATE(time) AS date,
                LAST(time) AS last_time,
                resource,
                CASE
                    WHEN resource_event_type = 'transcript_download' AND prev_event_data IS NOT NULL THEN prev_event_data
                    WHEN resource_event_type = 'transcript_download' AND prev_event_data IS NULL THEN 'en'
                    WHEN resource_event_type = 'transcript_language' THEN resource_event_data
                    ELSE resource_event_data
                END AS resource_event_data,
                resource_event_type,
                SUM(lang_count) AS langcount
            FROM
            (
                SELECT
                    course_id,
                    username,
                    resource,
                    resource_event_data,
                    resource_event_type,
                    LAG(time, 1) OVER (PARTITION BY username ORDER BY time) AS prev_time,
                    LAG(resource_event_type, 1) OVER (PARTITION BY username ORDER BY time) AS prev_event_type,
                    LAG(resource_event_data, 1) OVER (PARTITION BY username ORDER BY time) AS prev_event_data,
                    time,
                    COUNT(*) AS lang_count,
                    event_type
                FROM
                (
                    SELECT
                        time,
                        course_id,
                        username,
                        'video' AS resource,
                        CASE
                            WHEN module_id IS NOT NULL THEN REGEXP_EXTRACT(module_id, r'.*video/(.*)') # Newer data .. May 2016?
                            ELSE REGEXP_EXTRACT(event_type, r'.*;_video;_(.*)\/handler\/transcript\/translation\/.*')
                        END # Older data
                        AS resource_id,
                        CASE
                            WHEN (event_type = 'edx.ui.lms.link_clicked'   # Trasncript Download
                                  AND REGEXP_MATCH(JSON_EXTRACT(event, '$.target_url'), r'(.*handler/transcript/download)')
                                 ) THEN 'transcript_download'
                            WHEN (REGEXP_MATCH(event_type, "/transcript/translation/.*")
                                 ) THEN 'transcript_language'
                            ELSE NULL
                        END AS resource_event_type,
                        REGEXP_EXTRACT(event_type, r'.*\/handler\/transcript\/translation\/(.*)') AS resource_event_data,
                        event_type,
                    FROM {DATASETS}
                    WHERE time > TIMESTAMP("2010-10-01 01:02:03")
                          AND username != ""
                          AND ( (NOT event_type CONTAINS "/xblock/"       # Trasncript Download
                                 AND event_type = 'edx.ui.lms.link_clicked'   # Trasncript Download
                                 AND REGEXP_MATCH(JSON_EXTRACT(event, '$.target_url'), r'(.*handler/transcript/download)')
                                )     # Transcript Download
                                OR (REGEXP_MATCH(event_type, "/transcript/translation/.*")
                                   )  # Transcript Language Toggle
                              )
                    ORDER BY time
                )
                GROUP BY username, course_id, resource, resource_event_data, resource_event_type, event_type, time
            )
            GROUP BY username, course_id, date, resource, resource_event_data, resource_event_type,
            ORDER BY date ASC, username ASC
          '''

    def gdf(row):
        # get_date_function used for incremental updates: rows are keyed by 'date'
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf)
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all day's data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    The query is assembled from fragments: a base SELECT of per-day event counts,
    plus video / forum / problem columns that are included only when the
    corresponding upstream tables (video_stats_day, forum_events, person_problem
    et al.) exist in the course dataset; otherwise constant-zero columns are
    substituted so the output schema stays fixed.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    # Probe for the optional video stats table; absence just disables video columns.
    videoTableExists = False
    try:
        tinfo_video = bqutil.get_bq_table_info(dataset, 'video_stats_day')
        assert tinfo_video is not None, "Video stats table missing... Not including video stats"
        videoTableExists = True
    # NOTE(review): (AssertionError, Exception) is redundant — AssertionError is an
    # Exception subclass; kept as-is since this is a byte-identical documentation pass.
    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    # Probe for the optional forum events table; absence just disables forum columns.
    forumTableExists = False
    try:
        tinfo_forum = bqutil.get_bq_table_info(dataset, 'forum_events')
        assert tinfo_forum is not None, "Forum events table missing... Not including forum stats"
        forumTableExists = True
    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    # Problem columns need all four of these tables to be present.
    problemTableExists = False
    try:
        tinfo_personproblem = bqutil.get_bq_table_info(dataset, 'person_problem')
        tinfo_courseproblem = bqutil.get_bq_table_info(dataset, 'course_problem')
        tinfo_courseaxis = bqutil.get_bq_table_info(dataset, 'course_axis')
        tinfo_personcourse = bqutil.get_bq_table_info(dataset, 'person_course')
        # Check course axis and person course, course problem
        assert tinfo_personproblem is not None, "Person problem table missing... Not including problem stats"
        assert tinfo_courseproblem is not None, "Course problem table missing... Not including problem stats"
        assert tinfo_courseaxis is not None, "Course axis table missing... Not including problem stats"
        assert tinfo_personcourse is not None, "Person Course table missing... Not including problem stats"
        problemTableExists = True
    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass

    # --- SQL fragments (BigQuery legacy SQL), concatenated below into PCDAY_SQL_NEW ---

    # Per-day event counts common to all courses.
    PCDAY_SQL_BASE_SELECT = """
              SELECT
                  username,
                  '{course_id}' AS course_id,
                  DATE(time) AS date,
                  SUM(bevent) AS nevents,
                  SUM(bprogress) AS nprogcheck,
                  SUM(bshow_answer) AS nshow_answer,
                  SUM(bvideo) AS nvideo,
                  SUM(bproblem_check) AS nproblem_check,
                  SUM(bforum) AS nforum,
                  SUM(bshow_transcript) AS ntranscript,
                  SUM(bseq_goto) AS nseq_goto,
                  SUM(bseek_video) AS nseek_video,
                  SUM(bpause_video) AS npause_video,
              """

    PCDAY_SQL_VIDEO_EXISTS = """
                  COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed
                  SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position
              """

    PCDAY_SQL_VIDEO_DNE = """
                  0 AS nvideos_viewed, # New Video - Unique videos viewed
                  FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position
              """
    PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE

    PCDAY_SQL_FORUM_EXISTS = """
                  SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads
                  SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts
                  COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with
              """

    PCDAY_SQL_FORUM_DNE = """
                  0 AS nforum_reads, # New discussion - Forum reads
                  0 AS nforum_posts, # New discussion - Forum posts
                  0 AS nforum_threads, # New discussion - Unique forum threads interacted with
              """
    PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE

    PCDAY_SQL_PROBLEM_EXISTS = """
                  COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted
                  SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts
                  SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice,
                  SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice,
                  SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical,
                  SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option,
                  SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom,
                  SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string,
                  SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed,
                  SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula,
                  SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other,
              """

    PCDAY_SQL_PROBLEM_DNE = """
                  0 AS nproblems_answered, # New Problem - Unique problems attempted
                  0 AS nproblems_attempted, # New Problem - Total attempts
                  0 AS nproblems_multiplechoice,
                  0 AS nproblems_choice,
                  0 AS nproblems_numerical,
                  0 AS nproblems_option,
                  0 AS nproblems_custom,
                  0 AS nproblems_string,
                  0 AS nproblems_mixed,
                  0 AS nproblems_forumula,
                  0 AS nproblems_other,
              """
    PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE

    # Session-gap statistics (dt between successive events per user, 5-min timeout),
    # plus the inner per-event flag table scanned from the tracking logs.
    PCDAY_SQL_MID = """
                  MIN(time) AS first_event,
                  MAX(time) AS last_event,
                  AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS avg_dt,
                  STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt,
                  MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt,
                  COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt,
                  SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt
              FROM (
                SELECT
                  *
                FROM (
                  SELECT
                    username,
                    CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo,
                    CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check,
                    CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent,
                    CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
                    CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
                    CASE WHEN event_type IN ("show_answer", "showanswer") THEN 1 ELSE 0 END AS bshow_answer,
                    CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript,
                    CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto,
                    CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video,
                    CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video,
                    # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
                    # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
                    time,
                    LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time
                  FROM {DATASETS}
                  WHERE
                    NOT event_type CONTAINS "/xblock/"
                    AND username != ""
                )
                """

    # Optional joined sub-tables; each is appended to PCDAY_SQL_ADD only when its
    # source table exists (see the probes above).
    PCDAY_SQL_VIDEO = """
                ,
                ( # Video events
                  SELECT TIMESTAMP(date) as time,
                         '{course_id}' as course_id,
                         username,
                         video_id,
                         position,
                  FROM [{dataset}.video_stats_day]
                  WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}")
                )
                """
    PCDAY_SQL_ADD = PCDAY_SQL_VIDEO if videoTableExists else ''

    PCDAY_SQL_FORUM = """
                ,
                ( # Forum Events
                  SELECT time,
                         username,
                         '{course_id}' as course_id,
                         thread_id,
                         (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply"
                                     or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment")
                               THEN 1 ELSE 0 END) AS write,
                         (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read,
                  FROM [{dataset}.forum_events]
                  WHERE (forum_action == "reply" or forum_action == "comment_reply"
                         or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment"
                         or forum_action == "read" or forum_action == "read_inline")
                        and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") )
                )
                """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_FORUM if forumTableExists else PCDAY_SQL_ADD

    # NOTE(review): inside this fragment the GROUP BY lists ncount_problem_choice twice;
    # harmless duplication, left untouched in this documentation-only pass.
    PCDAY_SQL_PROBLEM = """,
                ( # Problems
                  SELECT pc.username AS username,
                         pp.problem_nid AS problem_nid,
                         pp.n_attempts AS n_attempts,
                         pp.time AS time,
                         '{course_id}' as course_id,
                         pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice,
                         pp.ncount_problem_choice as ncount_problem_choice,
                         pp.ncount_problem_numerical as ncount_problem_numerical,
                         pp.ncount_problem_option as ncount_problem_option,
                         pp.ncount_problem_custom as ncount_problem_custom,
                         pp.ncount_problem_string as ncount_problem_string,
                         pp.ncount_problem_mixed as ncount_problem_mixed,
                         pp.ncount_problem_formula as ncount_problem_formula,
                         pp.ncount_problem_other as ncount_problem_other,
                  FROM
                  (
                     (
                      SELECT PP.user_id as user_id,
                             PP.problem_nid AS problem_nid,
                             PP.n_attempts as n_attempts,
                             PP.date as time,
                             (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice
                             (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice,       # Choice
                             (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input
                             (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option,       # Choice
                             (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom,       # Custom
                             (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string,       # Input
                             (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed,                 # Mixed
                             (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula,            # Input
                             (Case when CP_CA.data_itype != "multiplechoiceresponse" and
                                        CP_CA.data_itype != "choiceresponse" and
                                        CP_CA.data_itype != "numericalresponse" and
                                        CP_CA.data_itype != "optionresponse" and
                                        CP_CA.data_itype != "customresponse" and
                                        CP_CA.data_itype != "stringresponse" and
                                        CP_CA.data_itype != "mixed" and
                                        CP_CA.data_itype != "forumula"
                                   then 1 else 0 end) as ncount_problem_other, # Input
                             #MAX(n_attempts) AS n_attempts,
                             #MAX(date) AS time,
                      FROM [{dataset}.person_problem] PP
                      LEFT JOIN
                      (
                        SELECT CP.problem_nid as problem_nid,
                               INTEGER(CP.problem_id) as problem_id,
                               CA.data.itype as data_itype,
                        FROM [{dataset}.course_problem] CP
                        LEFT JOIN [{dataset}.course_axis] CA
                        ON CP.problem_id == CA.url_name
                      ) as CP_CA
                      ON PP.problem_nid == CP_CA.problem_nid
                      GROUP BY time, user_id, problem_nid, n_attempts,
                               ncount_problem_multiplechoice, ncount_problem_choice, ncount_problem_choice,
                               ncount_problem_numerical, ncount_problem_option, ncount_problem_custom,
                               ncount_problem_string, ncount_problem_mixed, ncount_problem_formula,
                               ncount_problem_other
                     )
                     #FROM [{dataset}.person_item] PI
                     #JOIN [{dataset}.course_item] CI
                     #ON PI.item_nid = CI.item_nid
                     #GROUP BY user_id,
                     #problem_nid
                     #ORDER BY
                     #user_id,
                     #problem_nid
                  ) AS pp
                  LEFT JOIN
                  (
                    SELECT username, user_id
                    FROM [{dataset}.person_course]
                  ) AS pc
                  ON pc.user_id = pp.user_id
                  WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}")
                )
                """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_PROBLEM if problemTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_END = """
                )
                WHERE time > TIMESTAMP("{last_date}")
                GROUP BY course_id, username, date
                ORDER BY date
                """

    PCDAY_SQL_NEW = PCDAY_SQL_BASE_SELECT + PCDAY_SQL_VIDEO_SELECT + PCDAY_SQL_FORUM_SELECT + PCDAY_SQL_PROBLEM_SELECT + PCDAY_SQL_MID + PCDAY_SQL_ADD + PCDAY_SQL_END

    # Substitute the dataset now, but re-emit the run-time placeholders
    # ({course_id}, {DATASETS}, {last_date}, ...) so run_query_on_tracking_logs
    # can fill them per incremental run.
    PCDAY_SQL = PCDAY_SQL_NEW.format(dataset=dataset, course_id="{course_id}", DATASETS="{DATASETS}",
                                     last_date="{last_date}", min_date_start="{min_date_start}",
                                     max_date_end="{max_date_end}")

    table = 'person_course_day'

    def gdf(row):
        # get_date_function for the incremental-update machinery: rows are keyed by 'date'
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print "=== Processing person_course_day for %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it
    # has new schema, else force recompute.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception as err:
        tinfo = None

    if tinfo:
        fields = tinfo['schema']['fields']
        field_names = [x['name'] for x in fields]
        if not 'nvideos_viewed' in field_names:
            cdt = tinfo['creationTime']
            print " --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt
            sys.stdout.flush()
            force_recompute = True

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     newer_than=datetime.datetime( 2017, 2, 8, 16, 30 ),
                                                     skip_last_day=skip_last_day)

    print "Done with person_course_day for %s (end %s)" % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def CreateForumEvents(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events,
    including forum post reads, into the date-time ordered table.  Repeated calls to this
    procedure will append new events to the table.  If no new events are found, the existing
    table is left unchanged.

    Each row classifies the raw event_type URL (or edx.forum.* server event) into a
    forum_action label ("reply", "upvote", "read", "created_thread", ...) and extracts the
    thread_id, subject, and target user where the URL encodes them.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    #
    #  "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    # BigQuery legacy SQL; {hash_limit} is substituted by run_query_on_tracking_logs
    # when the query must be split by username hash to fit size limits.
    the_sql = """
              SELECT time, 
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                     (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)')    # For old-school courses with transparent course ids
                           else (case when module_id is null    # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                      then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+')
                                                 then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)')  # read
                                                 else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/')
                                                            then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)')   # upvote, pinned, upvoted, unvoted, deleted, followed
                                                            else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end)  # comment
                                            end)
                                 end)
                      end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     )
                  AND username is not null
                  AND event is not null
                  and time > TIMESTAMP("{last_date}")
                  {hash_limit}
              order by time
              """

    # Announce whether we're appending to an existing table or creating it fresh;
    # either way, proceed to run_query_on_tracking_logs below.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table )
        assert tinfo is not None, "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, table, course_id)

        print "[make_forum_analysis] Appending latest data to %s.%s table for %s" % (dataset, table, course_id)
        sys.stdout.flush()

    # NOTE(review): (AssertionError, Exception) is redundant — AssertionError already
    # subclasses Exception; left unchanged in this documentation-only pass.
    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % ( dataset, table )
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        # get_date_function: rows are keyed by the epoch-seconds 'time' column
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                     )

    print "Done with Forum Events for %s (end %s)" % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def process_course_time_on_task(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                skip_totals=False, start_date=None):
    '''
    Create the time_on_task table, containing time, user_id, and time on task stats.  
    This table isn't split into separate days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have already been done.  Then do all tracking logs
    except those which have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_task_totals, by summing over all dates, grouped by user.

    Parameters (beyond the standard course_id / force_recompute / use_dataset_latest /
    end_date / start_date passed through to run_query_on_tracking_logs):

      just_do_totals     - skip the daily table entirely; only (re)compute time_on_task_totals.
      limit_query_size   - passed through; presumably makes run_query_on_tracking_logs chunk
                           large queries (see process_tracking_logs) -- TODO confirm.
      table_max_size_mb  - size threshold passed through with limit_query_size.
      skip_totals        - build the daily table but do not recompute time_on_task_totals.
    '''
    # Totals-only mode: delegate and return immediately.
    if just_do_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest)

    # Legacy-BigQuery SQL.  Structure, innermost to outermost:
    #   1. Classify each tracking-log event as video / problem / forum / text (regex on event_type).
    #   2. Use lag(time) over per-user partitions to find each event's predecessor.  The
    #      uname_* columns blank out the username for non-matching events so that the
    #      per-category lag() only looks back at events of the same category for the same user.
    #   3. Convert the lagged usec values back to timestamps (only when the category flag is set).
    #   4. Compute dt = gap to previous event, in seconds, overall and per category;
    #      serial_dt_* measures the gap to the previous event of ANY tracked category (xevent).
    #   5. Aggregate per (course_id, username, date), truncating gaps at 5*60 and 30*60 seconds
    #      so long idle periods are not counted as time on task.
    SQL = """
            SELECT 
                    "{course_id}" as course_id,
                    date(time) as date,
                    username, 

                    # total time spent on system
                    SUM( case when dt < 5*60 then dt end ) as total_time_5,
                    SUM( case when dt < 30*60 then dt end ) as total_time_30,

                    # total time spent watching videos
                    SUM( case when (dt_video is not null) and (dt_video < 5*60) then dt_video end ) as total_video_time_5,
                    SUM( case when (dt_video is not null) and (dt_video < 30*60) then dt_video end ) as total_video_time_30,
                    SUM( case when (serial_dt_video is not null) and (serial_dt_video < 30*60) then serial_dt_video end ) as serial_video_time_30,

                    # total time spent doing problems
                    SUM( case when (dt_problem is not null) and (dt_problem < 5*60) then dt_problem end ) as total_problem_time_5,
                    SUM( case when (dt_problem is not null) and (dt_problem < 30*60) then dt_problem end ) as total_problem_time_30,
                    SUM( case when (serial_dt_problem is not null) and (serial_dt_problem < 30*60) then serial_dt_problem end ) as serial_problem_time_30,

                    # total time spent on forum
                    SUM( case when (dt_forum is not null) and (dt_forum < 5*60) then dt_forum end ) as total_forum_time_5,
                    SUM( case when (dt_forum is not null) and (dt_forum < 30*60) then dt_forum end ) as total_forum_time_30,
                    SUM( case when (serial_dt_forum is not null) and (serial_dt_forum < 30*60) then serial_dt_forum end ) as serial_forum_time_30,

                    # total time spent with textbook or wiki
                    SUM( case when (dt_text is not null) and (dt_text < 5*60) then dt_text end ) as total_text_time_5,
                    SUM( case when (dt_text is not null) and (dt_text < 30*60) then dt_text end ) as total_text_time_30,
                    SUM( case when (serial_dt_text is not null) and (serial_dt_text < 30*60) then serial_dt_text end ) as serial_text_time_30,
            FROM (
              SELECT time,
                username,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                case when is_video then (time - last_time_video)/1.0E6 end as dt_video,
                case when is_problem then (time - last_time_problem)/1.0E6 end as dt_problem,
                case when is_forum then (time - last_time_forum)/1.0E6 end as dt_forum,
                case when is_text then (time - last_time_text)/1.0E6 end as dt_text,
                case when is_video then (time - last_time_xevent)/1.0E6 end as serial_dt_video,
                case when is_problem then (time - last_time_xevent)/1.0E6 end as serial_dt_problem,
                case when is_forum then (time - last_time_xevent)/1.0E6 end as serial_dt_forum,
                case when is_text then (time - last_time_xevent)/1.0E6 end as serial_dt_text,
              FROM (
                SELECT time,
                  username,
                  last_username,
                  USEC_TO_TIMESTAMP(last_time) as last_time,
                  (case when is_video then USEC_TO_TIMESTAMP(last_time_video) end) as last_time_video,
                  # last_username_video,
                  # last_event_video,
                  is_problem,
                  is_video,
                  (case when is_problem then USEC_TO_TIMESTAMP(last_time_problem) end) as last_time_problem,
                  # last_username_problem,
                  # last_event_problem,
                  is_forum,
                  is_text,
                  (case when is_forum then USEC_TO_TIMESTAMP(last_time_forum) end) as last_time_forum,
                  (case when is_text then USEC_TO_TIMESTAMP(last_time_text) end) as last_time_text,
                  is_xevent,
                  (case when is_xevent then USEC_TO_TIMESTAMP(last_time_xevent) end) as last_time_xevent,
                FROM
                  (SELECT time,
                    username,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    is_video,
                    is_problem,
                    is_forum,
                    is_text,
                    (is_video or is_problem or is_forum or is_text) as is_xevent,   # x = video, problem, forum, or text: any event
                    case when is_problem then username else '' end as uname_problem,
                    case when is_video then username else '' end as uname_video,
                    case when is_forum then username else '' end as uname_forum,
                    case when is_text then username else '' end as uname_text,
                    case when (is_video or is_problem or is_forum or is_text) then username else '' end as uname_xevent,
                    lag(time, 1) over (partition by uname_video order by time) last_time_video,
                    # lag(event_type, 1) over (partition by uname_video order by time) last_event_video,
                    # lag(uname_video, 1) over (partition by uname_video order by time) last_username_video,
                    lag(time, 1) over (partition by uname_problem order by time) last_time_problem,
                    # lag(event_type, 1) over (partition by uname_problem order by time) last_event_problem,
                    # lag(uname_problem, 1) over (partition by uname_problem order by time) last_username_problem,
                    lag(time, 1) over (partition by uname_forum order by time) last_time_forum,
                    lag(time, 1) over (partition by uname_text order by time) last_time_text,
                    lag(time, 1) over (partition by uname_xevent order by time) last_time_xevent,
                  FROM
                    (SELECT time, 
                      username,
                      event_type,
                      case when (REGEXP_MATCH(event_type ,r'\w+_video') or REGEXP_MATCH(event_type, r'\w+_transcript') ) then True else False end as is_video,
                      case when REGEXP_MATCH(event_type, r'problem_\w+') then True else False end as is_problem,
                      case when (REGEXP_MATCH(event_type ,r'^edx\.forum\..*') 
                                 or event_type contains "/discussion/forum"
                                 or event_type contains "/discussion/threads"
                                 or event_type contains "/discussion/comments"
                                ) then True else False end as is_forum,
                      case when (REGEXP_MATCH(event_type ,r'^textbook\..*') 
                                 or event_type contains "/wiki/"
                                ) then True else False end as is_text,
                    FROM {DATASETS}
                    WHERE NOT event_type contains "/xblock/"
                       AND username is not null
                       AND username != ""
                       and time > TIMESTAMP("{last_date}")
                       {hash_limit}
                    )
                  )
                WHERE last_time is not null
                ORDER BY username, time
              )
            )
            group by course_id, username, date
            order by date, username;
          """

    table = 'time_on_task'

    def gdf(row):
        # Rows carry a 'date' column (YYYY-MM-DD string) rather than a usec timestamp,
        # unlike the event-level tables which parse row['time'].
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015,3,15),    # schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    # Unless told otherwise, roll the per-day numbers up into time_on_task_totals.
    if not skip_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest)

    return