'''
Report the size of a course forum stored in the MongoDB database: the total
number of documents in the forum collection and how many of them are
CommentThread documents.

The two counts are printed on one line, posts first, then threads.

Usage:

python <path_to_script>

'''

from base_edx import EdXConnection

connection = EdXConnection('forum')
collection = connection.get_access_to_collection()
forum = collection['forum']

# A CommentThread represents the first level of interaction: a post that opens
# a new thread, often a student question of some sort
thread_filter = {'_type': 'CommentThread'}
number_of_comment_threads = forum.find(thread_filter).count()

# An unfiltered query counts every document in the forum collection
number_of_posts = forum.find().count()

# Single pre-formatted string: produces "<posts> <threads>"
print('%s %s' % (number_of_posts, number_of_comment_threads))
'''
Copy every server-side problem_check event from the tracking collection into
a separate collection, keeping only the fields listed in the loop below.

Usage:

python create_problem_ids_collection.py

Note: Make sure the module base_edx is in the same directory as this script so
that it can be imported
'''

from base_edx import EdXConnection

# The second argument to EdXConnection is the name of the new collection which
# will contain the results of this script. Each new document will be inserted
# into this new collection. The name of the resulting collection could be
# anything; preferably relevant to the course
connection = EdXConnection('tracking_atoc185x', 'atoc185x_problem_ids')
collection = connection.get_access_to_collection()

problem_checks = collection['tracking_atoc185x'].find(
    {'event_type': 'problem_check', 'event_source': 'server'})

for document in problem_checks:
    # One trimmed-down document is inserted per matching tracking event.
    collection['atoc185x_problem_ids'].insert({
        'username': document['username'],
        'problem_id': document['event']['problem_id'],
        'course_id': document['context']['course_id'],
        'module': document['context']['module'],
        'time': document['time'],
        'event': document['event'],
    })
'''
Gather the IP addresses recorded for each user in the tracking collection,
and load a lookup table from country code to country name.

Usage:

python ip_to_country.py <db_name>

'''

import geoip
import csv
from collections import Counter, defaultdict
import sys

from base_edx import EdXConnection
from generate_csv_report import CSV

db_name = sys.argv[1]

# Change name of collection as required
connection = EdXConnection(db_name, 'tracking')
collection = connection.get_access_to_collection()

# The csv file country_code_to_country.csv was retrieved from
# http://dev.maxmind.com/geoip/legacy/geolite/
# This was used to get a mapping of a country code to a country
with open('country_code_to_country.csv') as f:
    # dict() over the reader: every row must hold exactly two columns.
    country_code_to_country = dict(csv.reader(f))

cursor = collection['tracking'].find()

# username -> set of the distinct IP addresses seen for that user
result = {}
for item in cursor:
    result.setdefault(item['username'], set()).add(item['ip'])
python entrance_exit_surveys.py ''' from collections import defaultdict import json import sys # These modules can be found under reporting scripts. You will have to add them # in the same directory as this script from base_edx import EdXConnection from generate_csv_report import CSV db_name = sys.argv[1] connection = EdXConnection( db_name, 'courseware_studentmodule', 'auth_user') # EdXConnection('courseware_studentmodule', 'auth_user') collection = connection.get_access_to_collection() # Modify key-value pairs in survey_pages to the name of the survey pages and to # the problem ids that maps to the survey pages E.g. if a course have a survey_pages = { 'entrance_survey': { 'general_info': 'i4x://McGillX/CHEM181x/problem/c9d2efffbdf043e68789bd60cd4954e3', 'demographics_background': 'i4x://McGillX/CHEM181x/problem/134cfc9efb2b400bab2ee1505cc9e4a9', 'aspirations_motivation': 'i4x://McGillX/CHEM181x/problem/579ae070227c4f5c973eb02affdcba2a' },
''' This module counts the number of navigation events: seq_next, seq_prev, seq_goto for those students who completed the course ''' import csv from base_edx import EdXConnection from generate_csv_report import CSV connection = EdXConnection('tracking_atoc185x') collection = connection.get_access_to_collection() with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file: reader = csv.reader(csv_file) usernames = [row[2] for row in reader] cursor = collection['tracking_atoc185x'].aggregate([{"$match" : {"event_source" : "browser", "$or" : [{"event_type" : "seq_prev"},{"event_type" : "seq_goto"},{"event_type" : "seq_next"}], 'username' : {'$in' : usernames}}}, {"$group" : {"_id" : {'chapter_name' : "$parent_data.chapter_display_name", "display_name" : "$metadata.display_name", "event_type" : "$event_type", "event_old" : "$event.old", "event_new" : "$event.new"}, "count" : {"$sum" : 1}}}]) #with open('csv_files/navigation_frequency_completers.csv', 'w') as csv_file: # writer = csv.writer(csv_file) # writer.writerow(['Chapter Name', 'Display Name', 'Event Type', 'Event Old', 'Event New', 'Count']) # for item in cursor['result']: # try: # writer.writerow([item['_id']['chapter_name'], item['_id']['display_name'], item['_id']['event_type'], item['_id'].get('event_old', 0), item['_id']['event_new'], item['count']]) # except: # pass result = [] for item in cursor['result']:
'''
Look up the username and hash id of every user id found in the first column
of a csv report (using the user_id_map collection), i.e. map the user ids
to their hash ids and return a new csv_report

Usage:

python user_id_to_hash_id_reports.py db_name csv_report

'''

import sys
import csv

from base_edx import EdXConnection
from generate_csv_report import CSV

db_name = sys.argv[1]

# Change name of collection as required
connection = EdXConnection(db_name, 'user_id_map')
collection = connection.get_access_to_collection()

with open(sys.argv[2]) as f:
    # The header is taken off the file as a raw text line before the csv
    # reader is created, so `data` holds the data rows only.
    headers = next(f)
    data = list(csv.reader(f))

# Each output row is the input row with the username and the hash id inserted
# right after the user id.
result = []
for row in data:
    # NOTE(review): find_one returns None for an id missing from user_id_map,
    # which makes the lookups below fail; confirm every id is expected to exist.
    user = collection['user_id_map'].find_one({'id': long(row[0])})
    hash_id = user['hash_id']
    username = user['username']
    result.append([row[0], username, hash_id] + row[1:])

# Split on the LAST dot only, so that paths such as './reports/grades.csv' or
# 'grades.2014.csv' still unpack into exactly (name, extension).
input_file, extension = sys.argv[2].rsplit('.', 1)
'''
This module determines how many chapters were accessed by each user for a given
course

Usage:

python chapters_accessed_per_user

'''

from collections import defaultdict

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('tracking', 'course_structure')
collection = connection.get_access_to_collection()

# Get all chapters
chapters = collection['course_structure'].distinct(
    'parent_data.chapter_display_name')

tracking = collection['tracking'].find()
result = []
for document in tracking:
    if 'parent_data' in document:
        # TODO: the per-user chapter accesses are not computed yet, so
        # `result` stays empty and the report contains no data rows.
        pass

# list.extend() works in place and returns None, so passing
# ['Username'].extend(chapters) handed None to CSV instead of the header row.
# Concatenation builds the intended header list.
headers = ['Username'] + list(chapters)

output = CSV(result, headers,
             output_file='atoc185x_chapters_accesses_per_user.csv')
output.generate_csv()
from generate_csv_report import CSV # If you have access to the grade report provided by edX, you can use the following # 7 lines of code to get all usernames with grades between 50% and 59% inclusive #with open('csv_files/grades_report.csv') as f: # reader = csv.reader(f) # header = reader.next() # usernames = [row[2] for row in reader if '0.5' <= row[3] <= '0.59'] #connection = EdXConnection('tracking') #collection = connection.get_access_to_collection() #cursor = collection['tracking'].aggregate([{'$match' : {'username' : {'$in' : usernames}, '$or': [{'event_type' : 'play_video'},{'event_type' : 'problem_check', 'event_source' : 'server'}]}},{'$group' : { '_id' : { "username" : "$username", "chapter_name" : "$parent_data.chapter_display_name" ,"sequential_name" : "$parent_data.sequential_display_name","vertical_name" : "$parent_data.vertical_display_name"}}}, {'$out' : 'students_50_to_59_events'}]) # Else you can extract the names of the students with grades betweeb 50% and 59% # inclusive from the collection certificates_generatedcertificate from the # following lines of code connection = EdXConnection('tracking_atoc185x', 'auth_user', 'certificates_generatedcertificate') collection = connection.get_access_to_collection() # Get all user ids of students with grades between 50% and 59% inclusive from # the collection certificates_generatedcertificate user_ids = { document['user_id'] for document in collection['certificates_generatedcertificate'].find( {'$and': [{ 'grade': { '$gte': 0.5 } }, { 'grade': { '$lte': 0.59 }
''' This module gets the date of registration of all users who completed the course Usage: python date_of_registration_completers.py ''' import csv from datetime import datetime from base_edx import EdXConnection from generate_csv_report import CSV connection = EdXConnection('student_courseenrollment') collection = connection.get_access_to_collection() # Can replace csv file with any csv file that contains the list of usernames # who completed the course and achieved a certificate. Alternately, one can # save that info in another collection in mongoDB and extra it from the collection with open('atoc185x/course_completers.csv') as csv_file: reader = csv.reader(csv_file) reader.next() users = {row[0]: row[1] for row in reader} result = [] student_courseenrollment = collection['student_courseenrollment'].find() seen = set() for document in student_courseenrollment: if str(document['user_id']) in users and document['user_id'] not in seen:
''' This module gets the number of navigation events for each user while they are taking Test 1 ''' from collections import defaultdict import time from datetime import datetime from generate_csv_report import CSV from base_edx import EdXConnection connection = EdXConnection('format_tests', 'tracking') collection = connection.get_access_to_collection() cursor = collection['format_tests'].find({'parent_data.chapter_display_name' : 'Test 1'}) users_sessions = defaultdict(list) for index,item in enumerate(cursor): #print index, item['parent_data']['chapter_display_name'] users_sessions[(item['username'], item['session'])].append(item['time']) users_tests_events = defaultdict(int) for (username,session),times in users_sessions.iteritems(): end_time = datetime.strptime(max(times).split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") start_time = datetime.strptime(min(times).split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") cursor = collection['tracking'].find({'username' : username, 'session' : session, '$or' : [{'event_type' : 'seq_goto'},{'event_type':'seq_prev'},{'event_type' : 'seq_next'}]}) for index, document in enumerate(cursor): try: time_stamp = datetime.strptime(document['time'].split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") if start_time <= time_stamp <= end_time: if 'sequential_display_name' in document['parent_data'] and document['parent_data']['sequential_display_name']: sequential_display_name = document['parent_data']['sequential_display_name'] elif document['metadata']['display_name']:
''' This module gets all the events per user while watching videos. Since we will need to sort a very large number of documents, user should create a separate collection to aggregate all required documents in one collection and then extract results from the new collection Command to run on the mongo shell to creare new collection: db.tracking_atoc185x.aggregate([{$match : {$and : [{"event_type" : "seek_video"},{ "parent_data": { $exists: true } }]}}, {$sort : {"parent_data.chapter_display_name" : 1, "parent_data.sequential_display_name" : 1, "parent_data.vertical_display_name" : 1}}, {$out : "seek_video"}, {allowDiskUse : true}]) ''' from base_edx import EdXConnection from generate_csv_report import CSV connection = EdXConnection('seek_video') collection = connection.get_access_to_collection() sort_parameters = [('parent_data.chapter_display_name', 1), ('parent_data.sequential_display_name', 1), ('parent_data.vertical_display_name', 1)] cursor = collection['seek_video'].find() result = [] for index, item in enumerate(cursor): if 'old_time' in item['event']: old_time = item['event']['old_time'] else: old_time = 0 result.append([ item['username'], item['parent_data']['chapter_display_name'], item['parent_data']['sequential_display_name'], item['parent_data']['vertical_display_name'], old_time, item['event']['new_time']
''' This module retrieve the first activity of all user who completed a course The list of users who has completed the course was provided in a given csv file, McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv. When these users are extracted, we then check the database for all of these users and detect their first activity. First activity is defined by the event_type '/courses/McGillX/CHEM181x/1T2014/info' ''' import csv from datetime import datetime from collections import defaultdict from base_edx import EdXConnection # Connect to MongoDB and extra the tracking collection connection = EdXConnection('tracking', 'tracking_before_jan22') collection = connection.get_access_to_collection() # Retrieve users who has completed the course. This could be done anyway depending on what is provided with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file: reader = csv.reader(csv_file) #usernames = [row[2] for row in reader] usernames = [row[2] for row in reader] # Retrieve the time of the first activity of all users who completed the course time_events = defaultdict(list) cursor = collection['tracking'].find( {'event_type': '/courses/McGillX/CHEM181x/1T2014/info'}) #cursor_before_jan_22 = collection['tracking_before_jan22'].find({'event_type' : '/courses/McGillX/CHEM181x/1T2014/info'}) with open('csv_files/first_activity_completers.csv', 'w') as csv_file:
'''
Working notes for the video watch duration report, followed by the current
script. As written, the script hands an empty result list and a single
Username header to the CSV generator.

username,
video associated with load_video event,
parent_data: {chapter_display_name, sequential_display_name, vertical_display_name,},
edx_video_id,
video watch segments

get the event_types : load_video, play_video, pause_video, seek_video
sort by "time": "" so that the events are chronologically ordered

for each load_video
new video watch segment should include ONLY:
- time between play_video -> next video event in time (pause_video or seek_video)
- time between seek_video : {'new_time' : Time} -> pause video (only with new_time > old_time, this is to avoid including rewinds)

watch periods:
event_type : pause_video - "event_type":"play_video" {"event":{"currentTime":TIME}} = new video watch segment
if seek_video : {'old_time' : TIME} < seek_video : {'new_time' : TIME}
"pause_video" {"event":{"currentTime":TIME}} - seek_video : {'new_time': TIME } = new video watch segment
if seek_video : {'old_time' : TIME} > seek_video : {'new_time' : TIME} = rewind (exclude from watch segments)
'''

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('video_watch_duration_collection')
collection = connection.get_access_to_collection()

# The cursor is opened but not consumed yet: no rows are derived from it.
cursor = collection['video_watch_duration_collection'].find()
result = []

headers = ['Username']
output = CSV(
    result,
    headers,
    output_file='video_watch_duration.csv',
    row_limit=200000)
output.generate_csv()
import csv import sys import json # These modules can be found under reporting scripts. You will have to add them # in the same directory as this script from base_edx import EdXConnection from generate_csv_report import CSV db_name = sys.argv[1] answer_distribution = sys.argv[2] connection = EdXConnection(db_name, 'tracking' ) collection = connection.get_access_to_collection() with open(answer_distribution) as f: csv_f = csv.reader(f) headers = csv_f.next() rows = [row for row in csv_f] answer_keys = dict() for row in rows: problem_id = row[1] value = row[4] answer_value = row[5] if value: value = value.replace('[', '').replace(']', '').split('|') answer_value = answer_value.replace('[', '').replace(']', '').split('|') dict_value_answer = dict(zip(value, answer_value)) try: answer_keys[problem_id].update(dict_value_answer)
db.tracking_atoc185x.aggregate([{$match : {event_type : 'problem_check', 'event_source': 'server'}}, {$group : {_id : {"username" : "$username", "problem_id" : "$event.problem_id"}, attempts : {$push : "$event.success"}}}, {$out : "user_attempts_per_problem_id"}]) Then run this script on the above collection Usage: python activities_with_lower_completion.py ''' from collections import defaultdict from base_edx import EdXConnection from generate_csv_report import CSV # Connect to MongoDB and extra the tracking collection connection = EdXConnection('user_attempts_per_problem_id') collection = connection.get_access_to_collection() cursor = collection['user_attempts_per_problem_id'].find() result = defaultdict(lambda: defaultdict(int)) for index, document in enumerate(cursor): # If there is a correct attempts, accept as answered correctly, else accept #as incorrect only once per student per problem id if 'correct' in document['attempts']: result[document['_id']['problem_id']]['correct'] += 1 else: result[document['_id']['problem_id']]['incorrect'] += 1 csv_result = [[item, result[item]['correct'], result[item]['incorrect']] for item in result] output = CSV(csv_result, ['Problem Id', 'Correct Count', 'Incorrect Count'],
'''
This module will retrieve info about students registered in the course

For every profile in auth_userprofile that has a matching document in
certificates_generatedcertificate, one row is collected holding the user id,
name, final grade, gender, year of birth, level of education, country and city.

Usage:

python user_info.py

'''

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('certificates_generatedcertificate', 'auth_userprofile')
collection = connection.get_access_to_collection()

documents = collection['auth_userprofile'].find()
result = []
for document in documents:
    user_id = document['user_id']
    certificate = collection['certificates_generatedcertificate'].find_one(
        {'user_id': user_id})
    if certificate is None:
        # Handle users with no grades: they have no certificate document, so
        # they are left out of the report.
        continue
    try:
        final_grade = certificate['grade']
        row = [
            user_id,
            document['name'],
            final_grade,
            document['gender'],
            document['year_of_birth'],
            document['level_of_education'],
            document['country'],
            document['city'],
        ]
    except KeyError:
        # A record missing one of the reported fields is skipped, as before.
        # Only KeyError is caught, so database errors and interrupts are no
        # longer silently swallowed the way the former bare `except` did.
        continue
    result.append(row)
'''
Export the speed_change_video events stored in the speed_change_video_data
collection to speed_change.csv: one row per event, giving the username, the
chapter / sequential / vertical names and the old and new playback speeds.

Since we will need to sort a very large number of documents, you should create
a separate collection to aggregate all required documents in one collection and
then extract results from the new collection.

Command to run on the mongo shell to create new collection:

db.tracking_atoc185x.aggregate([{$match : {$and : [{"event_type" : "speed_change_video"},{ "parent_data": { $exists: true } }]}}, {$sort : {"parent_data.chapter_display_name" : 1, "parent_data.sequential_display_name" : 1, "parent_data.vertical_display_name" : 1}}, {$out : "speed_change_video_data"}], {allowDiskUse : true})

Usage:

python speed_change_video.py

'''

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('speed_change_video_data')
collection = connection.get_access_to_collection()

cursor = collection['speed_change_video_data'].find()

# Column order of each row matches the header list below.
result = []
for item in cursor:
    result.append([
        item['username'],
        item['parent_data']['chapter_display_name'],
        item['parent_data']['sequential_display_name'],
        item['parent_data']['vertical_display_name'],
        item['event']['old_speed'],
        item['event']['new_speed'],
    ])

headers = [
    'Username',
    'Chapter Name',
    'Sequential Name',
    'Vertical Name',
    'Old Speed',
    'New Speed',
]
output = CSV(result, headers, output_file='speed_change.csv')
output.generate_csv()