def TransformResourceData(vars):
    """Number every source resource and insert it into the target ``resources`` table.

    Resources are fetched from ``vars['queries']`` one type at a time
    (indices, content sections, tutorials, tests, books, wikis, forums).
    A single counter starting at 1 runs across all types, so the assigned
    ``resource_id`` values are unique over the whole table.

    Args:
        vars: Shared context dict. This function reads ``vars['target']``
            (``host``, ``user``, ``password``, ``port``, ``db``),
            ``vars['queries']`` and ``vars['logger']``.

    Returns:
        A dict ``{resource_type: {original_id: resource_id}}`` covering every
        resource type processed, including types that had no items.
    """
    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'resource_id': 'num',
        'resource_type_id': 'num',
        'resource_name': 'string',
        'resource_uri': 'string',
        'resource_parent_id': 'num',
        'resource_child_number': 'num',
    }

    t = vars['target']
    resource_inserter = db.StaggeredInsert(
        t['host'], t['user'], t['password'], t['port'], t['db'],
        'resources', fields)

    queries = vars['queries']
    # Order matters: a parent type ('content_sections') must be processed
    # before the types that point at it ('tutorials', 'tests'), because the
    # parent lookup below indexes resource_id_map[parent_type] directly.
    resources = [
        {'type': 'indices', 'parent_type': None,
         'items': queries.GetIndices(vars)},
        {'type': 'content_sections', 'parent_type': None,
         'items': queries.GetContentSections(vars)},
        {'type': 'tutorials', 'parent_type': 'content_sections',
         'items': queries.GetTutorials(vars)},
        {'type': 'tests', 'parent_type': 'content_sections',
         'items': queries.GetTests(vars)},
        {'type': 'books', 'parent_type': None,
         'items': queries.GetBooks(vars)},
        {'type': 'wikis', 'parent_type': None,
         'items': queries.GetWikis(vars)},
        {'type': 'forums', 'parent_type': None,
         'items': queries.GetForums(vars)},
    ]

    resource_id_map = {}
    resource_moocdb_id = 1
    for resource_subset in resources:
        parent_type = resource_subset['parent_type']
        type_map = resource_id_map.setdefault(resource_subset['type'], {})

        for item in resource_subset['items']:
            item['resource_id'] = resource_moocdb_id
            type_map[item['original_id']] = resource_moocdb_id
            resource_moocdb_id += 1

            # The parent key is read for every item, even for types that
            # have no parent type.
            parent_original_id = item['resource_parent_original_id']
            parent_id = None
            if parent_type is not None and parent_original_id is not None:
                # An unknown parent is not an error: the resource is still
                # inserted, just without a parent link.
                parent_id = resource_id_map[parent_type].get(parent_original_id)
            item['resource_parent_id'] = parent_id

            resource_inserter.addRow({k: item[k] for k in fields})

    resource_inserter.insertPendingRows()
    vars["logger"].Log(vars, "Counts: Inserted {} resources to target".format(resource_inserter.num_inserted_rows))
    return resource_id_map
def InsertAssessments(vars, assessments):
    """Insert the given assessment records into the target ``assessments`` table.

    Args:
        vars: Shared context dict; only ``vars['target']`` (``host``,
            ``user``, ``password``, ``port``, ``db``) is read here.
        assessments: Iterable of mappings. Each must contain every key
            listed in ``fields`` below; any other keys are ignored.
    """
    target = vars['target']

    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'assessment_id': 'num',
        'submission_id': 'num',
        'assessment_grade': 'num',
        'assessment_max_grade': 'num',
        'assessment_grader_id': 'num',
        'assessment_timestamp': 'datetime',
    }

    inserter = db.StaggeredInsert(
        target['host'], target['user'], target['password'], target['port'],
        target['db'], 'assessments', fields)

    for record in assessments:
        # Project the record down to exactly the table's columns.
        row = {}
        for column in fields:
            row[column] = record[column]
        inserter.addRow(row)

    # Presumably writes out whatever rows are still queued — confirm
    # against db.StaggeredInsert.
    inserter.insertPendingRows()
def InsertSubmissions(vars, submissions):
    """Insert the given submission records into the target ``submissions`` table.

    Args:
        vars: Shared context dict; only ``vars['target']`` (``host``,
            ``user``, ``password``, ``port``, ``db``) is read here.
        submissions: Iterable of mappings. Each must contain every key
            listed in ``fields`` below; any other keys are ignored.
    """
    target = vars['target']

    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'submission_id': 'num',
        'user_id': 'num',
        'problem_id': 'num',
        'submission_timestamp': 'datetime',
        'submission_answer': 'string',
        'submission_attempt_number': 'num',
        'submission_is_submitted': 'num',
    }

    inserter = db.StaggeredInsert(
        target['host'], target['user'], target['password'], target['port'],
        target['db'], 'submissions', fields)

    for record in submissions:
        # Project the record down to exactly the table's columns.
        row = {}
        for column in fields:
            row[column] = record[column]
        inserter.addRow(row)

    # Presumably writes out whatever rows are still queued — confirm
    # against db.StaggeredInsert.
    inserter.insertPendingRows()
def InsertObservedEvents(vars, events):
    """Insert the given event records into the target ``observed_events`` table.

    Args:
        vars: Shared context dict; only ``vars['target']`` (``host``,
            ``user``, ``password``, ``port``, ``db``) is read here.
        events: Iterable of mappings. Each must contain every key listed in
            ``fields`` below; any other keys are ignored.
    """
    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'observed_event_type_id': 'num',
        'user_id': 'num',
        'item_id': 'num',
        'observed_event_timestamp': 'datetime',
        'observed_event_data': 'string',
    }

    t = vars['target']
    # Only an inserter is needed. An earlier version also built a
    # db.Selector on the target here and never used it.
    oe_inserter = db.StaggeredInsert(
        t['host'], t['user'], t['password'], t['port'], t['db'],
        'observed_events', fields)

    for event in events:
        oe_inserter.addRow({k: event[k] for k in fields})

    # Presumably writes out whatever rows are still queued — confirm
    # against db.StaggeredInsert.
    oe_inserter.insertPendingRows()
def TransformTestData(vars):
    """Translate problems, submissions, assessments and submission events.

    Runs in two phases:

    1. Every problem returned by ``vars['queries'].GetProblems`` gets a
       sequential ``problem_id`` (starting at 1) and is inserted into the
       target ``problems`` table.
    2. For every test returned by ``vars['queries'].GetTests``, its
       submissions (with nested assessments) are numbered and inserted,
       along with one 'problem_submission' observed event per submission.
       Submission and assessment ids run across all tests.

    Args:
        vars: Shared context dict. This function reads ``vars['target']``,
            ``vars['queries']``, ``vars['logger']`` and ``vars['id_maps']``
            (the ``'tests'`` and ``'users'`` maps must already be filled).

    Returns:
        A dict ``{'problems': {problem_original_id: problem_id}}``.

    Raises:
        KeyError: If a submission refers to a user or problem, or a problem
            refers to a test, that is missing from the relevant id map.
    """
    test_id_maps = {}

    # Problems
    # --------
    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'problem_id': 'num',
        'problem_name': 'string',
        'problem_type_id': 'num',
        'problem_parent_id': 'num',
        'resource_id': 'num',
        'problem_child_number': 'num',
        'problem_release_timestamp': 'datetime',
        'problem_soft_deadline': 'datetime',
        'problem_hard_deadline': 'datetime',
        'problem_max_submission': 'num',
    }

    t = vars['target']
    problem_inserter = db.StaggeredInsert(
        t['host'], t['user'], t['password'], t['port'], t['db'],
        'problems', fields)

    problem_id_map = {}
    problems = vars['queries'].GetProblems(vars)
    # Both the parent link and the resource link are resolved through the
    # 'tests' resource id map.
    test_resource_ids = vars['id_maps']['tests']
    for problem_index, problem in enumerate(problems, 1):
        problem['problem_id'] = problem_index
        problem_id_map[problem['problem_original_id']] = problem_index

        problem['problem_parent_id'] = None
        if problem['problem_parent_original_id'] is not None:
            problem['problem_parent_id'] = test_resource_ids[problem['problem_parent_original_id']]

        problem['resource_id'] = None
        if problem['resource_original_id'] is not None:
            problem['resource_id'] = test_resource_ids[problem['resource_original_id']]

        problem_inserter.addRow({k: problem[k] for k in fields})

    problem_inserter.insertPendingRows()
    test_id_maps['problems'] = problem_id_map

    # Submissions, submission events and assessments
    # -----------------------------------------------
    submission_index = 1
    assessment_index = 1
    tests = vars['queries'].GetTests(vars)
    oetid = moocdb_utils.GetObservedEventTypeMap(vars)['problem_submission']
    user_ids = vars['id_maps']['users']

    for ti, test in enumerate(tests, 1):
        vars['logger'].Log(vars, "\tSubmissions, assessments, and observed events for test {} out of {}".format(ti, len(tests)))

        # Attempt numbers restart for each test and are counted per user
        # across all problems of that test.
        user_num_submissions = {}
        submission_assessment_data = vars['queries'].GetSubmissionAndAssessmentData(vars, test)

        submissions = []
        assessments = []
        observed_events = []
        for submission in submission_assessment_data:
            user_id = user_ids[submission['user_original_id']]
            problem_id = problem_id_map[submission['problem_original_id']]

            attempt_number = user_num_submissions.get(user_id, 0) + 1
            user_num_submissions[user_id] = attempt_number

            submissions.append({
                'submission_id': submission_index,
                'user_id': user_id,
                'problem_id': problem_id,
                'submission_timestamp': submission['submission_timestamp'],
                'submission_answer': submission['submission_answer'],
                'submission_attempt_number': attempt_number,
                'submission_is_submitted': 1,
            })

            for assn in submission['assessments']:
                # A grader original id of 0 is passed through as grader id 0
                # and is not looked up. Other graders are resolved through
                # the same 'users' map as submitters. The earlier 'user' key
                # is used nowhere else in this file and is assumed to have
                # been a typo.
                if assn['grader_original_id'] == 0:
                    grader_id = 0
                else:
                    grader_id = user_ids[assn['grader_original_id'].lower()]

                assessments.append({
                    'assessment_id': assessment_index,
                    'submission_id': submission_index,
                    'assessment_grade': assn['grade'],
                    'assessment_max_grade': assn['max_grade'],
                    'assessment_grader_id': grader_id,
                    'assessment_timestamp': assn['assessment_timestamp'],
                })
                assessment_index += 1

            # The event must point at the problem this submission belongs
            # to. It previously read the loop variable left over from the
            # problems loop above, i.e. always the last problem.
            observed_events.append({
                'observed_event_type_id': oetid,
                'user_id': user_id,
                'item_id': problem_id,
                'observed_event_timestamp': submission['submission_timestamp'],
                'observed_event_data': '{}',
            })
            submission_index += 1

        # Insert once per test, so only one test's rows are held at a time.
        InsertSubmissions(vars, submissions)
        InsertAssessments(vars, assessments)
        InsertObservedEvents(vars, observed_events)

    return test_id_maps
def TransformCollaborationData(vars):
    """Translate forum posts, forum votes and wiki edits into ``collaborations``.

    Each item gets a sequential ``collaboration_id`` from a counter starting
    at 1 that runs across all three collaboration types. An item is inserted
    only if all of its references resolve: its parent (when both the type
    has a parent type and the item names a parent), its resource (when the
    item names one) and its user.

    Args:
        vars: Shared context dict. This function reads ``vars['target']``,
            ``vars['queries']``, ``vars['logger']`` and ``vars['id_maps']``
            (the ``'forums'``, ``'wikis'`` and ``'users'`` maps).

    Returns:
        A dict ``{collaboration_type: {original_id: collaboration_id}}``.
        Items that were skipped still consume an id and still appear here.
    """
    collaboration_id_maps = {}

    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'collaboration_id': 'num',
        'collaboration_type_id': 'num',
        'user_id': 'num',
        'resource_id': 'num',
        'collaboration_content': 'string',
        'collaboration_timestamp': 'datetime',
        'collaboration_parent_id': 'num',
        'collaboration_child_number': 'num',
    }

    t = vars['target']
    collaboration_inserter = db.StaggeredInsert(
        t['host'], t['user'], t['password'], t['port'], t['db'],
        'collaborations', fields)

    # Order matters: 'forum_posts' must come before 'forum_votes', whose
    # parents are looked up in the forum_posts id map.
    collaborations = [
        {
            'type': 'forum_posts',
            'items': vars['queries'].GetForumPosts(vars),
            'parent_type': 'forum_posts',
            'resource_type': 'forums'
        },
        {
            'type': 'forum_votes',
            'items': vars['queries'].GetForumVotes(vars),
            'parent_type': 'forum_posts',
            'resource_type': 'forums'
        },
        {
            'type': 'wiki_edits',
            'items': vars['queries'].GetWikiEdits(vars),
            'parent_type': None,
            'resource_type': 'wikis'
        },
    ]

    coll_index = 1
    for coll_subset in collaborations:
        coll_type = coll_subset['type']
        parent_type = coll_subset['parent_type']
        resource_type = coll_subset['resource_type']
        collaboration_id_maps[coll_type] = {}

        for item in coll_subset['items']:
            # The id is allotted and recorded before validation, so skipped
            # items leave gaps in the inserted ids and remain in the map.
            item['collaboration_id'] = coll_index
            collaboration_id_maps[coll_type][item['original_id']] = coll_index
            coll_index += 1

            add_item = True

            # Parent: for forum posts this searches the map currently being
            # built, so a parent must have been seen before its replies.
            item['collaboration_parent_id'] = None
            cpoid = item['collaboration_parent_original_id']
            if parent_type is not None and cpoid is not None:
                parent_map = collaboration_id_maps[parent_type]
                if cpoid in parent_map:
                    item['collaboration_parent_id'] = parent_map[cpoid]
                else:
                    add_item = False

            # Resource (forum or wiki) the item belongs to.
            item['resource_id'] = None
            roid = item['resource_original_id']
            if resource_type is not None and roid is not None:
                resource_map = vars['id_maps'][resource_type]
                if roid in resource_map:
                    item['resource_id'] = resource_map[roid]
                else:
                    add_item = False

            # Author; unlike parent and resource, this is always required.
            uoid = item['user_original_id']
            user_map = vars['id_maps']['users']
            if uoid in user_map:
                item['user_id'] = user_map[uoid]
            else:
                add_item = False

            if add_item:
                collaboration_inserter.addRow({k: item[k] for k in fields})

    collaboration_inserter.insertPendingRows()
    vars["logger"].Log(
        vars, "Counts: Inserted {} collaborations to target".format(
            collaboration_inserter.num_inserted_rows))
    return collaboration_id_maps
def TransformUserData(vars):
    """Translate source users into the target ``users`` table.

    Each user gets a sequential ``user_id`` starting at 1. Along the way the
    IP is converted with ``db.ip_aton``, a missing email becomes ``''``, a
    missing country is looked up from the IP in the core ``ip_country``
    table, and the timezone offset is derived from the country through the
    core ``timezone`` table.

    Args:
        vars: Shared context dict. This function reads ``vars['core']`` and
            ``vars['target']`` (each with ``host``, ``user``, ``password``,
            ``port``, ``db``), ``vars['queries']`` and ``vars['logger']``.

    Returns:
        A dict ``{original_id: user_id}`` with one entry per user processed.
    """
    # DB connections
    # --------------
    c = vars['core']
    core_db_selector = db.Selector(c['host'], c['user'], c['password'],
                                   c['port'], c['db'])

    # Populate the users table
    user_id_map = {}
    users = vars['queries'].GetUsers(vars)
    # Column name -> type tag, handed to the inserter as-is.
    fields = {
        'user_id': 'num',
        'user_email': 'string',
        'user_type_id': 'num',
        'user_join_timestamp': 'datetime',
        'user_ip': 'ip',
        'user_country': 'string',
        'user_timezone_offset': 'num',
        'user_final_grade': 'num',
    }

    # IP-country lookup table, ordered by range start.
    ip_country_rows = [{
        'start': int(x['ip_numeric_start']),
        'stop': int(x['ip_numeric_stop']),
        'country_code': x['country_code']
    } for x in core_db_selector.query(
        "SELECT ip_numeric_start,ip_numeric_stop,country_code FROM ip_country ORDER BY ip_numeric_start"
    )]

    t = vars['target']
    user_inserter = db.StaggeredInsert(
        t['host'], t['user'], t['password'], t['port'], t['db'],
        'users', fields)

    # country code -> timezone offset (or None). The offset depends only on
    # the country, so the core DB is queried once per country, not per user.
    timezone_offset_by_country = {}

    moocdb_user_id = 1
    for user in users:
        # User MOOCdb ID
        user['user_id'] = moocdb_user_id

        # User IP
        user['user_ip'] = db.ip_aton(user['user_ip'])

        # User email cannot be null
        if 'user_email' not in user or user['user_email'] is None:
            user['user_email'] = ''

        # User country
        if 'user_country' not in user:
            user['user_country'] = None
        # Note: Some platforms don't record IP, but do record country.
        # 'null' is presumably what db.ip_aton yields for a missing IP —
        # confirm against db.ip_aton.
        if user['user_country'] is None and user['user_ip'] != 'null':
            for ipc_row in ip_country_rows:
                if ipc_row['start'] <= user['user_ip'] <= ipc_row['stop']:
                    user['user_country'] = ipc_row['country_code']
                    break

        # User timezone offset
        # Derived from the country and not from the user's own record,
        # since some platforms provide incorrect data for user timezone.
        # The value used is the gmt_offset of the middle row returned for
        # the country; the query has no ORDER BY, so this is neither a mean
        # nor a true median.
        utzo = None
        country = user['user_country']
        if country is not None:
            if country not in timezone_offset_by_country:
                # NOTE(review): the country code is interpolated straight
                # into the SQL text. It can come from platform data, so
                # this should become a parameterized query if db.Selector
                # supports one.
                r = core_db_selector.query(
                    "SELECT * FROM timezone WHERE country_code='{}'".format(
                        country))
                offsets = [x['gmt_offset'] for x in r]
                # Floor division keeps the index an int on Python 3 too.
                timezone_offset_by_country[country] = (
                    offsets[len(offsets) // 2] if offsets else None)
            utzo = timezone_offset_by_country[country]
        user['user_timezone_offset'] = utzo

        # Columns the source did not provide are inserted as None.
        user_inserter.addRow(
            {k: user[k] if k in user else None for k in fields})
        user_id_map[user['original_id']] = moocdb_user_id
        moocdb_user_id += 1

    user_inserter.insertPendingRows()
    vars["logger"].Log(
        vars, "Counts: Inserted {} users to target".format(
            user_inserter.num_inserted_rows))
    return user_id_map