def get_course_id(event, from_url=False):
    """
    Look up the course_id associated with an event.

    The `course_id` stored in the event's `context` is used when present.
    If it is empty and `from_url` is true, the course key is instead parsed
    out of the URL found in `event_type` (server events) or `page` (browser
    events).

    Returns None when the event has no context, when the context's course_id
    fails validation, or when no course could be determined.
    """
    context = event.get('context')
    if context is None:
        # No context at all: assumed to be an old event, not worth logging.
        return None

    explicit_id = context.get('course_id', '')
    if explicit_id:
        if not opaque_key_util.is_valid_course_id(explicit_id):
            log.error("encountered event with bogus course_id: %s", event)
            return None
        return explicit_id

    if not from_url:
        return None

    # Implicit server events keep the URL in `event_type`, browser events in `page`.
    url = ''
    source = event.get('event_source')
    if source == 'server':
        url = event.get('event_type', '')
    elif source == 'browser':
        url = event.get('page', '')

    course_key = opaque_key_util.get_course_key_from_url(url)
    return unicode(course_key) if course_key else None
def mapper(self, line):
    """
    Emit a record for each explicit enrollment activation or deactivation.

    Yields ((course_id, user_id), (timestamp, event_type)).  Lines that do
    not parse, that carry a different event type, or that lack a usable
    timestamp, event data, course_id or user_id produce no output.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _unused_date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    enrollment_types = (DEACTIVATED, ACTIVATED)
    if event_type not in enrollment_types:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    course_id = event_data.get('course_id')
    if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    key = (course_id, user_id)
    value = (timestamp, event_type)
    yield key, value
def mapper(self, line):
    """
    Emit a record for each explicit enrollment event on the line.

    Handles activation, deactivation, mode-change and validation events.
    The course_id from the event data is normalized before validation.

    Yields ((utf-8 encoded course_id, user_id),
            (timestamp, event_type, mode, validation_info)),
    where validation_info is None unless the event data has a 'dump_start'
    entry.  Lines missing any required field produce no output.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _unused_date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    relevant_types = (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED)
    if event_type not in relevant_types:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    raw_course_id = event_data.get('course_id')
    course_id = opaque_key_util.normalize_course_id(raw_course_id)
    if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
        log.error(
            "encountered explicit enrollment event with invalid course_id: %s",
            event)
        return

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error(
            "encountered explicit enrollment event with no user_id: %s",
            event)
        return

    mode = event_data.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    # Extra properties that only synthetic enrollment validation events provide.
    if 'dump_start' in event_data:
        extra_fields = ('is_active', 'created', 'dump_start', 'dump_end')
        validation_info = {name: event_data.get(name) for name in extra_fields}
    else:
        validation_info = None

    # String parts of the key are encoded explicitly; user_id is an int.
    encoded_course_id = unicode(course_id).encode('utf-8')
    yield (encoded_course_id, user_id), (timestamp, event_type, mode, validation_info)
def mapper(self, line):
    """
    Emit a record for each link-clicked event on the line.

    Yields ((course_id, date_string), is_external), where is_external is 1
    when the target URL names a host different from the current URL's host,
    and 0 otherwise (including targets that omit the host entirely).
    """
    # Cheap substring filter before paying for a full parse.
    if LINK_CLICKED not in line:
        return

    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    event_type = event.get('event_type')
    if not event_type:
        log.error("encountered event with no event_type: %s", event)
        return
    if event_type != LINK_CLICKED:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        log.error("encountered explicit link_clicked event with no event data: %s", event)
        return

    context = event.get('context')
    if not context:
        log.error("encountered explicit link_clicked event with no context: %s", event)
        return

    course_id = context.get('course_id')
    if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
        log.error("encountered explicit link_clicked event with invalid course_id: %s", event)
        return

    target_url = event_data.get('target_url')
    if not target_url:
        log.error("encountered explicit link_clicked event with no target_url: %s", event)
        return

    current_url = event_data.get('current_url')
    if not current_url:
        log.error("encountered explicit link_clicked event with no current_url: %s", event)
        return

    # A link is "internal" when it stays on the current host.  Some internal
    # links leave the host out altogether (they start with "/"), which shows
    # up here as an empty target netloc.
    current_loc = urlparse(current_url).netloc
    target_loc = urlparse(target_url).netloc
    leaves_host = target_loc != "" and current_loc != target_loc
    is_external = 1 if leaves_host else 0

    yield (course_id, date_string), is_external
def mapper(self, line):
    """
    Emit a record for each explicit enrollment event on the line.

    Handles activation, deactivation, mode-change and validation events.

    Yields ((utf-8 encoded course_id, user_id),
            (timestamp, event_type, mode, validation_info)),
    where validation_info is None unless the event data has a 'dump_start'
    entry.  Lines missing any required field produce no output.
    """
    event_and_date = self.get_event_and_date_string(line)
    if event_and_date is None:
        return
    event, _ignored_date = event_and_date

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    handled_types = (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED)
    if event_type not in handled_types:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    course_id = event_data.get('course_id')
    course_id_ok = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not course_id_ok:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    mode = event_data.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    # Only synthetic enrollment validation events supply these extra properties.
    validation_info = None
    if 'dump_start' in event_data:
        validation_info = dict(
            (field, event_data.get(field))
            for field in ('is_active', 'created', 'dump_start', 'dump_end')
        )

    # Encode the string portion of the key; user_id is an int and is left alone.
    key = (unicode(course_id).encode('utf-8'), user_id)
    value = (timestamp, event_type, mode, validation_info)
    yield key, value
def mapper(self, line):
    """
    Emit a per-day record for each explicit enrollment event on the line.

    Handles activation, deactivation and mode-change events.  The course_id
    from the event data is normalized before validation.

    Yields (date_string, (course_id, user_id, timestamp, event_type, mode)).
    Lines missing any required field produce no output.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    wanted_types = (DEACTIVATED, ACTIVATED, MODE_CHANGED)
    if event_type not in wanted_types:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    raw_course_id = event_data.get('course_id')
    course_id = opaque_key_util.normalize_course_id(raw_course_id)
    if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
        log.error(
            "encountered explicit enrollment event with invalid course_id: %s",
            event)
        return

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error(
            "encountered explicit enrollment event with no user_id: %s",
            event)
        return

    mode = event_data.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    record = (course_id, user_id, timestamp, event_type, mode)
    yield date_string, record
def run(self): self.remove_output_on_overwrite() with self.input().open('r') as input_file: course_structure = json.load(input_file) with self.output().open('w') as output_file: courses_list = course_structure.get('results') if not courses_list: # If there are no courses, or 'results' is not a key in the json, output nothing. return for course in courses_list: # To maintain robustness, ignore any non-dictionary data that finds its way into the API response. try: start_string = course.get('start') end_string = course.get('end') if start_string is None: cleaned_start_string = '\N' else: cleaned_start_string = ciso8601.parse_datetime( start_string) if end_string is None: cleaned_end_string = '\N' else: cleaned_end_string = ciso8601.parse_datetime( end_string) course_id = course.get('id', '\N') if is_valid_course_id(course_id): course_key = CourseKey.from_string(course_id) course_run = course_key.run else: course_run = '\N' line = [ course_id, course.get('org', '\N'), course.get('number', '\N'), course_run, coerce_timestamp_for_hive(cleaned_start_string), coerce_timestamp_for_hive(cleaned_end_string), course.get('name', '\N') ] output_file.write('\t'.join( [v.encode('utf-8') for v in line])) output_file.write('\n') except AttributeError: # If the course is not a dictionary, move on to the next one. continue
def get_course_id(event):
    """
    Return the validated course_id from an event's context.

    Returns None when the event has no context, when the context has no
    (or an empty) course_id, or when the course_id fails validation; only
    the last case is logged.
    """
    context = event.get('context')
    if context is None:
        # No context at all: assumed to be an old event, not worth logging.
        return None

    course_id = context.get('course_id', '')
    if course_id:
        if opaque_key_util.is_valid_course_id(course_id):
            return course_id
        log.error("encountered event with bogus course_id: %s", event)
    return None
def run(self): self.remove_output_on_overwrite() with self.input().open('r') as input_file: course_structure = json.load(input_file) with self.output().open('w') as output_file: courses_list = course_structure.get('results') if not courses_list: # If there are no courses, or 'results' is not a key in the json, output nothing. return for course in courses_list: # To maintain robustness, ignore any non-dictionary data that finds its way into the API response. try: start_string = course.get('start') end_string = course.get('end') if start_string is None: cleaned_start_string = '\N' else: cleaned_start_string = ciso8601.parse_datetime(start_string) if end_string is None: cleaned_end_string = '\N' else: cleaned_end_string = ciso8601.parse_datetime(end_string) course_id = course.get('id', '\N') if is_valid_course_id(course_id): course_key = CourseKey.from_string(course_id) course_run = course_key.run else: course_run = '\N' line = [ course_id, course.get('org', '\N'), course.get('number', '\N'), course_run, coerce_timestamp_for_hive(cleaned_start_string), coerce_timestamp_for_hive(cleaned_end_string), course.get('name', '\N') ] output_file.write('\t'.join([v.encode('utf-8') for v in line])) output_file.write('\n') except AttributeError: # If the course is not a dictionary, move on to the next one. continue
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value = 1 (enrolled) or -1 (unenrolled)
        and timestamp is in ISO format, with resolution to the millisecond.

      or None if there is no valid enrollment event on the line.  That
      includes events whose data has no course_id, an invalid course_id,
      or no user_id; those are logged as errors.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # Try to parse the line into a dict.
    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    # Get the event type, and check that it exists.
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # Convert the type to a value.
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # Not an enrollment event...
        return None

    # Get the timestamp.  (The local is deliberately not called `datetime`,
    # so that it cannot shadow the standard-library module name.)
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)

    # Get the event data:
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  A missing key is
    # handled like an invalid value (logged and skipped) instead of
    # raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    # Get the user_id from the data:
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value = 1 (enrolled) or -1 (unenrolled)
        and timestamp is in ISO format, with resolution to the millisecond.

      or None if there is no valid enrollment event on the line.  Events
      whose data lacks a course_id, has an invalid course_id, or lacks a
      user_id are logged as errors and also give None.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # Try to parse the line into a dict.
    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    # Get the event type, and check that it exists.
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # Convert the type to a value.
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # Not an enrollment event...
        return None

    # Get the timestamp.  The local avoids the name `datetime` so that it
    # does not shadow the standard-library module.
    event_datetime = eventlog.get_event_time(event)
    if event_datetime is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_datetime)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)

    # Get the event data:
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  Using .get() means an
    # event without the key is logged and skipped rather than raising
    # KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error(
            "encountered explicit enrollment event with bogus course_id: %s",
            event)
        return None

    # Get the user_id from the data:
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)
def test_just_newline_course_id(self):
    # A course id consisting only of a newline must be rejected.
    is_valid = opaque_key_util.is_valid_course_id('\n')
    self.assertFalse(is_valid)
def test_normal_legacy_course_id(self):
    # The well-formed legacy course id fixture must be accepted.
    is_valid = opaque_key_util.is_valid_course_id(VALID_LEGACY_COURSE_ID)
    self.assertTrue(is_valid)
def test_legacy_course_id_without_components(self):
    # The invalid legacy course id fixture must be rejected.
    is_valid = opaque_key_util.is_valid_course_id(INVALID_LEGACY_COURSE_ID)
    self.assertFalse(is_valid)
def get_problem_check_event(line):
    """
    Generates output values for explicit problem_check events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, problem_id, username), (timestamp, problem_check_info)

        where timestamp is in ISO format, with resolution to the millisecond
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.

      or None if there is no valid problem_check event on the line.  Events
      without a context or course_id, with an invalid course_id, or without
      a problem_id are logged as errors; events with no answers are skipped
      silently.

    Example:
            (edX/DemoX/Demo_Course, i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username),
            (2013-09-10T00:01:05.123456, blah)

    """
    # Parse the line into a dict.
    event = eventlog.parse_json_server_event(line, 'problem_check')
    if event is None:
        return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # above), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    #
    # A missing (or null) context is treated the same as a missing
    # course_id, so such events are logged and skipped instead of raising
    # AttributeError.
    context = problem_data.get('context') or {}
    course_id = context.get('course_id')
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # Skip events that carry no answers (missing, empty, or null).
    if not event.get('event', {}).get('answers', []):
        return None

    problem_data_json = json.dumps(problem_data)
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value
def test_course_id_with_valid_nonascii(self):
    # The valid non-ASCII legacy course id fixture must be accepted.
    is_valid = opaque_key_util.is_valid_course_id(VALID_NONASCII_LEGACY_COURSE_ID)
    self.assertTrue(is_valid)
def test_no_course_id(self):
    # None in place of a course id must be rejected.
    is_valid = opaque_key_util.is_valid_course_id(None)
    self.assertFalse(is_valid)
def test_empty_course_id(self):
    # An empty string must be rejected as a course id.
    is_valid = opaque_key_util.is_valid_course_id('')
    self.assertFalse(is_valid)
def test_invalid_course_id(self, course_id):
    # The supplied course_id is expected to fail validation.
    is_valid = opaque_key_util.is_valid_course_id(course_id)
    self.assertFalse(is_valid)
def get_problem_check_event(line_or_event):
    """
    Generates output values for explicit problem_check events.

    Args:

      line_or_event: pre-parsed event dict, or text line from a tracking event log

    Returns:

      (course_id, problem_id, username), (timestamp, problem_check_info)

        where timestamp is in ISO format, with resolution to the millisecond
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.

      or None if there is no valid problem_check event on the line.  Events
      with a missing or invalid course_id, no problem_id, or invalid answer
      ids are logged as errors; events with no answers are skipped silently.

    Example:
            (edX/DemoX/Demo_Course, i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username),
            (2013-09-10T00:01:05.123456, blah)

    """
    if isinstance(line_or_event, dict):
        # Ensure the given event dict is a problem_check event.
        event = line_or_event
        if event.get('event_type') != 'problem_check':
            return None
    else:
        # Parse the line into an event dict, if not provided.
        event = eventlog.parse_json_server_event(line_or_event, 'problem_check')
        if event is None:
            return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # above), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # Keep `event` bound to the whole event so that every error message
    # logs the same thing; the inner payload gets its own name.
    payload = event.get('event', {})
    answers = payload.get('answers', {})
    if not answers:
        # No answers (missing, empty, or null): nothing to report.
        return None

    try:
        _check_answer_ids(answers)
        _check_answer_ids(payload.get('submission', {}))
    except (TypeError, ValueError):
        log.error("encountered explicit problem_check event with invalid answers: %s", event)
        return None

    problem_data_json = json.dumps(problem_data)
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value
def test_course_id_with_invalid_nonascii(self):
    # The invalid non-ASCII legacy course id fixture must be rejected.
    is_valid = opaque_key_util.is_valid_course_id(INVALID_NONASCII_LEGACY_COURSE_ID)
    self.assertFalse(is_valid)
def test_valid_course_id(self, course_id):
    # The supplied course_id is expected to pass validation.
    is_valid = opaque_key_util.is_valid_course_id(course_id)
    self.assertTrue(is_valid)
def test_course_id_with_nonascii(self):
    # The non-ASCII legacy course id fixture must be rejected.
    is_valid = opaque_key_util.is_valid_course_id(NONASCII_LEGACY_COURSE_ID)
    self.assertFalse(is_valid)
def test_newline_terminated_course_id(self):
    # An otherwise valid course id with a trailing newline must be rejected.
    candidate = VALID_COURSE_ID + '\n'
    self.assertFalse(opaque_key_util.is_valid_course_id(candidate))