def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, _date_string = value event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return if event_type not in (DEACTIVATED, ACTIVATED): return timestamp = eventlog.get_event_time_string(event) if timestamp is None: log.error("encountered event with bad timestamp: %s", event) return event_data = eventlog.get_event_data(event) if event_data is None: return course_id = event_data.get('course_id') if course_id is None or not opaque_key_util.is_valid_course_id(course_id): log.error("encountered explicit enrollment event with invalid course_id: %s", event) return user_id = event_data.get('user_id') if user_id is None: log.error("encountered explicit enrollment event with no user_id: %s", event) return yield (course_id, user_id), (timestamp, event_type)
def _obfuscate_event(self, event): """Obfuscate event by removing/stubbing user information.""" # Create a string to use when logging errors, to provide some context. debug_str = self._get_log_string_for_event(event) # Remap the user information stored in the event, and collect for later searching. event_data = eventlog.get_event_data(event) user_info = self._remap_user_info_in_event(event, event_data) # Clean or remove values from context. if 'context' in event: # These aren't present in current events, but are generated historically by some implicit events. event['context'].pop('host', None) event['context'].pop('ip', None) # Not sure how to clean this, so removing. event['context'].pop('path', None) # Clean out some of the more obvious values in (mobile) client: if 'client' in event['context']: event['context']['client'].pop('device', None) event['context']['client'].pop('ip', None) # Do remaining cleanup on the entire event payload (assuming user-based remapping has been done). if event_data: # Remove possibly sensitive payload fields (or fields we don't know how to clean). for key in [ 'certificate_id', 'certificate_url', 'source_url', 'fileName', 'GET', 'POST', 'requesting_student_id', 'report_url', 'url', 'url_name']: event_data.pop(key, None) for key in ['answer', 'saved_response']: if key in event_data and 'file_upload_key' in event_data[key]: event_data[key].pop('file_upload_key') # Clean up remaining event payload recursively. updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", user_info) if updated_event_data is not None: if self.obfuscator.is_logging_enabled(): log.info(u"Obfuscated payload: %s", debug_str) event_data = updated_event_data # Re-encode payload as a json string if it originally was one. # (This test works because we throw away string values that didn't parse as JSON.) if isinstance(event.get('event'), basestring): event['event'] = cjson.encode(event_data) else: event['event'] = event_data ip_address = event.get('ip') # Skip over IPv6-format ip_address values for now. if ip_address and ':' not in ip_address: country_code = self.get_country_code(ip_address) event.update({'augmented': {'country_code': country_code}}) # Delete base properties other than username. for key in ['host', 'ip', 'page', 'referer']: event.pop(key, None) return event
def test_get_truncated_post_string_event_data(self): post = '{"POST": "not JSON because of truncation %s' post512 = post % ('a' * (512 + 2 - len(post))) self.assertEquals(len(post512), 512) item = {"event": post512} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.debug.call_count, 1) self.assertIn('truncated event value', self.mock_log.debug.call_args[0][0])
def get_raw_event(self, event_line): event = eventlog.parse_json_event(event_line) event_data = eventlog.get_event_data(event) if event_data is not None: event['event'] = event_data dump = json.dumps(event, sort_keys=True) encoded_dump = backslash_encode_value(dump) return encoded_dump
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, _date_string = value event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return if event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED): return timestamp = eventlog.get_event_time_string(event) if timestamp is None: log.error("encountered event with bad timestamp: %s", event) return event_data = eventlog.get_event_data(event) if event_data is None: return course_id = event_data.get('course_id') if course_id is None or not opaque_key_util.is_valid_course_id( course_id): log.error( "encountered explicit enrollment event with invalid course_id: %s", event) return user_id = event_data.get('user_id') if user_id is None: log.error( "encountered explicit enrollment event with no user_id: %s", event) return mode = event_data.get('mode') if mode is None: log.error("encountered explicit enrollment event with no mode: %s", event) return # Pull in extra properties provided only by synthetic enrollment validation events. validation_info = None if 'dump_start' in event_data: validation_info = { 'is_active': event_data.get('is_active'), 'created': event_data.get('created'), 'dump_start': event_data.get('dump_start'), 'dump_end': event_data.get('dump_end'), } # Make sure key values that are strings are properly encoded. # Note, however, that user_id is an int. key = (unicode(course_id).encode('utf-8'), user_id) yield key, (timestamp, event_type, mode, validation_info)
def mapper(self, line): # We only want to consider lines that include the type of event with which we are concerned. if LINK_CLICKED not in line: return value = self.get_event_and_date_string(line) if value is None: return event, date_string = value event_type = event.get('event_type') if not event_type: log.error("encountered event with no event_type: %s", event) return if event_type != LINK_CLICKED: return event_data = eventlog.get_event_data(event) if event_data is None: log.error( "encountered explicit link_clicked event with no event data: %s", event) return course_id = eventlog.get_course_id(event) if course_id is None: log.error( "encountered explicit link_clicked event with invalid course_id: %s", event) return target_url = event_data.get('target_url') if not target_url: log.error( "encountered explicit link_clicked event with no target_url: %s", event) return current_url = event_data.get('current_url') if not current_url: log.error( "encountered explicit link_clicked event with no current_url: %s", event) return # A link is considered "internal" when it does not navigate away from the current host. # Some internal links exclude the host name entirely- they start with / so we account for that. current_loc = urlparse(current_url).netloc target_loc = urlparse(target_url).netloc is_external = 0 if current_loc != target_loc and target_loc != "": is_external = 1 yield (course_id, date_string), (is_external)
def test_get_truncated_post_string_event_data(self): post = '{"POST": "not JSON because of truncation %s' post512 = post % ('a' * (512 + 2 - len(post))) self.assertEquals(len(post512), 512) item = {"event": post512} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.debug.call_count, 1) self.assertIn('truncated event value', self.mock_log.debug.call_args[0][0])
def mapper(self, line): # We only want to consider lines that include the type of event with which we are concerned. if LINK_CLICKED not in line: return value = self.get_event_and_date_string(line) if value is None: return event, date_string = value event_type = event.get('event_type') if not event_type: log.error("encountered event with no event_type: %s", event) return if event_type != LINK_CLICKED: return event_data = eventlog.get_event_data(event) if event_data is None: log.error("encountered explicit link_clicked event with no event data: %s", event) return context = event.get('context') if not context: log.error("encountered explicit link_clicked event with no context: %s", event) return course_id = context.get('course_id') if course_id is None or not opaque_key_util.is_valid_course_id(course_id): log.error("encountered explicit link_clicked event with invalid course_id: %s", event) return target_url = event_data.get('target_url') if not target_url: log.error("encountered explicit link_clicked event with no target_url: %s", event) return current_url = event_data.get('current_url') if not current_url: log.error("encountered explicit link_clicked event with no current_url: %s", event) return # A link is considered "internal" when it does not navigate away from the current host. # Some internal links exclude the host name entirely- they start with / so we account for that. current_loc = urlparse(current_url).netloc target_loc = urlparse(target_url).netloc is_external = 0 if current_loc != target_loc and target_loc != "": is_external = 1 yield (course_id, date_string), (is_external)
def _obfuscate_event(self, event): """Obfuscate event by removing/stubbing user information.""" # Create a string to use when logging errors, to provide some context. debug_str = self._get_log_string_for_event(event) # Remap the user information stored in the event, and collect for later searching. event_data = eventlog.get_event_data(event) user_info = self._remap_user_info_in_event(event, event_data) # Clean or remove values from context. if 'context' in event: # These aren't present in current events, but are generated historically by some implicit events. event['context'].pop('host', None) event['context'].pop('ip', None) # Not sure how to clean this, so removing. event['context'].pop('path', None) # Clean out some of the more obvious values in (mobile) client: if 'client' in event['context']: event['context']['client'].pop('device', None) event['context']['client'].pop('ip', None) # Do remaining cleanup on the entire event payload (assuming user-based remapping has been done). if event_data: # Remove possibly sensitive payload fields (or fields we don't know how to clean). for key in [ 'certificate_id', 'certificate_url', 'source_url', 'fileName', 'GET', 'POST', 'requesting_student_id', 'report_url', 'url', 'url_name']: event_data.pop(key, None) for key in ['answer', 'saved_response']: if key in event_data and 'file_upload_key' in event_data[key]: event_data[key].pop('file_upload_key') # Clean up remaining event payload recursively. updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", user_info) if updated_event_data is not None: if self.obfuscator.is_logging_enabled(): log.info(u"Obfuscated payload: %s", debug_str) event_data = updated_event_data # Re-encode payload as a json string if it originally was one. # (This test works because we throw away string values that didn't parse as JSON.) if isinstance(event.get('event'), basestring): event['event'] = cjson.encode(event_data) else: event['event'] = event_data # Delete base properties other than username. for key in ['host', 'ip', 'page', 'referer']: event.pop(key, None) return event
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, _date_string = value event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return if event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED): return timestamp = eventlog.get_event_time_string(event) if timestamp is None: log.error("encountered event with bad timestamp: %s", event) return event_data = eventlog.get_event_data(event) if event_data is None: return course_id = event_data.get('course_id') if course_id is None or not opaque_key_util.is_valid_course_id(course_id): log.error("encountered explicit enrollment event with invalid course_id: %s", event) return user_id = event_data.get('user_id') if user_id is None: log.error("encountered explicit enrollment event with no user_id: %s", event) return mode = event_data.get('mode') if mode is None: log.error("encountered explicit enrollment event with no mode: %s", event) return # Pull in extra properties provided only by synthetic enrollment validation events. validation_info = None if 'dump_start' in event_data: validation_info = { 'is_active': event_data.get('is_active'), 'created': event_data.get('created'), 'dump_start': event_data.get('dump_start'), 'dump_end': event_data.get('dump_end'), } # Make sure key values that are strings are properly encoded. # Note, however, that user_id is an int. key = (unicode(course_id).encode('utf-8'), user_id) yield key, (timestamp, event_type, mode, validation_info)
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, date_string = value event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return if event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED): return timestamp = eventlog.get_event_time_string(event) if timestamp is None: log.error("encountered event with bad timestamp: %s", event) return event_data = eventlog.get_event_data(event) if event_data is None: return course_id = opaque_key_util.normalize_course_id( event_data.get('course_id')) if course_id is None or not opaque_key_util.is_valid_course_id( course_id): log.error( "encountered explicit enrollment event with invalid course_id: %s", event) return user_id = event_data.get('user_id') if user_id is None: log.error( "encountered explicit enrollment event with no user_id: %s", event) return mode = event_data.get('mode') if mode is None: log.error("encountered explicit enrollment event with no mode: %s", event) return yield date_string, (course_id, user_id, timestamp, event_type, mode)
def deidentify_event(self, event): """Deidentify event by removing/stubbing user information.""" if 'context' in event and 'user_id' in event['context'] and event['context']['user_id']: # found '' as user_id event['context']['user_id'] = self.remap_id(event['context']['user_id']) event.pop('host', None) # delete host event.pop('ip', None) # delete ip if event['username']: try: event['username'] = self.generate_deid_username_from_user_id(self.username_map[event['username']]) except KeyError: log.error("Unable to find username: %s in the username_map", event['username']) return None event_data = eventlog.get_event_data(event) if event_data: if 'user_id' in event_data and event_data['user_id']: event_data['user_id'] = self.remap_id(event_data['user_id']) if 'username' in event_data: # How to DRY ? try: event_data['username'] = self.generate_deid_username_from_user_id(self.username_map[event_data['username']]) except KeyError: log.error("Unable to find username: %s in the username_map", event_data['username']) return None for key in ['certificate_id', 'certificate_url', 'source_url', 'fileName', 'GET', 'POST']: event_data.pop(key, None) for key in ['answer', 'saved_response']: if key in event_data and 'file_upload_key' in event_data[key]: event_data[key].pop('file_upload_key') # TODO: clean ? if isinstance(event.get('event'), basestring): # re-encode as a json string if originally event['event'] = cjson.encode(event_data) else: event['event'] = event_data return event
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, date_string = value username = event.get('username', '').strip() if not username: return event_type = event.get('event_type') if event_type is None: return course_id = eventlog.get_course_id(event) if not course_id: return event_data = eventlog.get_event_data(event) if event_data is None: return event_source = event.get('event_source') entity_id, entity_type, user_actions = self.get_user_actions_from_event(event_data, event_source, event_type) if not entity_id or not entity_type: return for action in user_actions: record = ModuleEngagementRecord( course_id=course_id, username=username, date=DateField().deserialize_from_string(date_string), entity_type=entity_type, entity_id=entity_id, event=action, count=0 ) # The count is populated by the reducer, so exclude it from the key. record_without_count = record.to_string_tuple()[:-1] yield (record_without_count, 1)
def mapper(self, line): """ Args: line: text line from a tracking event log. Yields: (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct) """ value = self.get_event_and_date_string(line) if value is None: return event, _ = value if event.get('event_type') != 'problem_check' or event.get( 'event_source') != 'server': return timestamp = eventlog.get_event_time_string(event) if timestamp is None: return course_id = eventlog.get_course_id(event) if not course_id: return org_id = opaque_key_util.get_org_id_for_course(course_id) event_data = eventlog.get_event_data(event) if event_data is None: return problem_id = event_data.get('problem_id') if not problem_id: return is_correct = event_data.get('success') == 'correct' saved_tags = event.get('context').get('asides', {}).get( 'tagging_aside', {}).get('saved_tags', {}) yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
def mapper(self, line): """ Args: line: text line from a tracking event log. Yields: (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct) """ value = self.get_event_and_date_string(line) if value is None: return event, _ = value if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server': return timestamp = eventlog.get_event_time_string(event) if timestamp is None: return course_id = eventlog.get_course_id(event) if not course_id: return org_id = opaque_key_util.get_org_id_for_course(course_id) event_data = eventlog.get_event_data(event) if event_data is None: return problem_id = event_data.get('problem_id') if not problem_id: return is_correct = event_data.get('success') == 'correct' saved_tags = event.get('context').get('asides', {}).get('tagging_aside', {}).get('saved_tags', {}) yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
def mapper(self, line): event, date_string = self.get_event_and_date_string(line) or (None, None) if event is None: return if not self.is_valid_input_file(): return org_id = self.get_org_id(event) if org_id not in self.org_id_whitelist: log.debug('Unrecognized organization: org_id=%s', org_id or '') return # Do not export events that have been explicitly flagged as not being for export. # Any event without an '_export' key will be sent as part of the export by default, # and likewise any event without a falsey value. The preferred value to not export is 'false'. event_data = eventlog.get_event_data(event) if event_data and str(event_data.get('_export', 'true')).lower() in ('n', 'f', '0', 'false', 'no'): return # Check to see if the org_id is one that should be grouped with other org_ids. org_ids = self.primary_org_ids_for_org_id[org_id] for key_org_id in org_ids: key = (date_string, key_org_id) # Include only requested courses requested_courses = self.courses_for_org_id.get(key_org_id) if requested_courses and eventlog.get_course_id(event, from_url=True) not in requested_courses: continue # Enforce a standard encoding for the parts of the key. Without this a part of the key # might appear differently in the key string when it is coerced to a string by luigi. For example, # if the same org_id appears in two different records, one as a str() type and the other a # unicode() then without this change they would appear as u'FooX' and 'FooX' in the final key # string. Although python doesn't care about this difference, hadoop does, and will bucket the # values separately. Which is not what we want. yield tuple([value.encode('utf8') for value in key]), line.strip()
def test_event_data_with_list_type(self): item = {"event": ["a list", "of strings"]} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.debug.call_count, 1) self.assertIn('value of type list', self.mock_log.debug.call_args[0][0])
def mapper(self, line): # Add a filter here to permit quicker rejection of unrelated events. if VIDEO_EVENT_MINIMUM_STRING not in line: # self.incr_counter(self.counter_category_name, 'Discard Missing Video String', 1) return value = self.get_event_and_date_string(line) if value is None: return event, _date_string = value # self.incr_counter(self.counter_category_name, 'Inputs with Dates', 1) event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1) return if event_type not in VIDEO_EVENT_TYPES: # self.incr_counter(self.counter_category_name, 'Discard Non-Video Event Type', 1) return # self.incr_counter(self.counter_category_name, 'Input Video Events', 1) # This has already been checked when getting the event, so just fetch the value. timestamp = eventlog.get_event_time_string(event) # Strip username to remove trailing newlines that mess up Luigi. username = event.get('username', '').strip() if not username: log.error("Video event without username: %s", event) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing username', 1) return course_id = eventlog.get_course_id(event) if course_id is None: log.warn('Video event without valid course_id: {0}'.format(line)) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing course_id', 1) return event_data = eventlog.get_event_data(event) if event_data is None: # This should already have been logged. ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Event Data', 1) return encoded_module_id = event_data.get( 'id', '').strip() # we have seen id values with leading newline if not encoded_module_id: log.warn( 'Video event without valid encoded_module_id (id): {0}'.format( line)) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing encoded_module_id', 1) return video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION) if not video_duration: # events may have a 'duration' value of null, so use the same default for those as well. video_duration = VIDEO_UNKNOWN_DURATION # self.incr_counter(self.counter_category_name, 'Video Events Before Time Check', 1) current_time = None old_time = None youtube_id = None if event_type == VIDEO_PLAYED: code = event_data.get('code') if code not in VIDEO_CODES: youtube_id = code current_time = self._check_time_offset( event_data.get('currentTime'), line) if current_time is None: ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Play', 1) return ### self.incr_counter(self.counter_category_name, 'Subset Play', 1) elif event_type == VIDEO_PAUSED: # Pause events may have a missing currentTime value if video is paused at the beginning, # so provide a default of zero. current_time = self._check_time_offset( event_data.get('currentTime', 0), line) if current_time is None: ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Pause', 1) return ### self.incr_counter(self.counter_category_name, 'Subset Pause', 1) elif event_type == VIDEO_SEEK: current_time = self._check_time_offset(event_data.get('new_time'), line) old_time = self._check_time_offset(event_data.get('old_time'), line) if current_time is None or old_time is None: ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Seek', 1) return ### self.incr_counter(self.counter_category_name, 'Subset Seek', 1) elif event_type == VIDEO_STOPPED: current_time = self._check_time_offset( event_data.get('currentTime'), line) if current_time is None: ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Stop', 1) return ### self.incr_counter(self.counter_category_name, 'Subset Stop', 1) if youtube_id is not None: youtube_id = youtube_id.encode('utf8') # self.incr_counter(self.counter_category_name, 'Output Video Events from Mapper', 1) yield ((username.encode('utf8'), course_id.encode('utf8'), encoded_module_id.encode('utf8')), (timestamp, event_type, current_time, old_time, youtube_id, video_duration))
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, date_string = value username = event.get('username', '').strip() if not username: return event_type = event.get('event_type') if event_type is None: return course_id = eventlog.get_course_id(event) if not course_id: return event_data = eventlog.get_event_data(event) if event_data is None: return event_source = event.get('event_source') entity_id = '' info = {} forum_post_voted = None if event_type == 'problem_check': if event_source != 'server': return problem_id = event_data.get('problem_id') if not problem_id: return entity_id = problem_id if event_data.get('success', 'incorrect').lower() == 'correct': info['correct'] = True elif event_type == 'play_video': encoded_module_id = event_data.get('id') if not encoded_module_id: return entity_id = encoded_module_id elif event_type[:9] == '/courses/' and re.match( self.SUBSECTION_ACCESSED_PATTERN, event_type): timestamp = eventlog.get_event_time_string(event) if timestamp is None: return info['path'] = event_type info['timestamp'] = timestamp event_type = SUBSECTION_VIEWED_MARKER elif event_type.startswith('edx.forum'): forum_post_voted = re.match( r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type) if forum_post_voted: info['vote_value'] = event_data.get('vote_value') if info['vote_value'] not in ['up', 'down']: return info['undo_vote'] = event_data.get('undo_vote', False) date_grouping_key = date_string if self.interval_type == 'weekly': last_complete_date = self.interval.date_b - datetime.timedelta( days=1) # pylint: disable=no-member last_weekday = last_complete_date.isoweekday() split_date = date_string.split('-') event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2])) event_weekday = event_date.isoweekday() days_until_end = last_weekday - event_weekday if days_until_end < 0: days_until_end += 7 end_of_week_date = event_date + datetime.timedelta( days=days_until_end) date_grouping_key = end_of_week_date.isoformat() elif self.interval_type == 'all': # If gathering all data for a given user, use the last complete day of the interval # for joining with enrollment. last_complete_date = self.interval.date_b - datetime.timedelta( days=1) # pylint: disable=no-member date_grouping_key = last_complete_date.isoformat() yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string)) if forum_post_voted: # We emit two events for each "voted" event - one for the voting user and one for the # user receiving the vote. username = event_data.get('target_username') if not username: return event_type = 'edx.forum.{}.vote_received'.format( forum_post_voted.group('post_type')) yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def mapper(self, line): # Add a filter here to permit quicker rejection of unrelated events. if VIDEO_EVENT_MINIMUM_STRING not in line: # self.incr_counter(self.counter_category_name, 'Discard Missing Video String', 1) return value = self.get_event_and_date_string(line) if value is None: return event, _date_string = value # self.incr_counter(self.counter_category_name, 'Inputs with Dates', 1) event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1) return if event_type not in VIDEO_EVENT_TYPES: # self.incr_counter(self.counter_category_name, 'Discard Non-Video Event Type', 1) return # self.incr_counter(self.counter_category_name, 'Input Video Events', 1) # This has already been checked when getting the event, so just fetch the value. timestamp = eventlog.get_event_time_string(event) user_id = event.get('context', {}).get('user_id') if not user_id: log.error("Video event without user_id in context: %s", event) return # Convert user_id to int if str if not isinstance(user_id, int): user_id = int(user_id) course_id = eventlog.get_course_id(event) if course_id is None: log.warn('Video event without valid course_id: {0}'.format(line)) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing course_id', 1) return event_data = eventlog.get_event_data(event) if event_data is None: # This should already have been logged. # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Event Data', 1) return encoded_module_id = event_data.get('id', '').strip() # we have seen id values with leading newline if not encoded_module_id: log.warn('Video event without valid encoded_module_id (id): {0}'.format(line)) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing encoded_module_id', 1) return video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION) if not video_duration: # events may have a 'duration' value of null, so use the same default for those as well. video_duration = VIDEO_UNKNOWN_DURATION # self.incr_counter(self.counter_category_name, 'Video Events Before Time Check', 1) current_time = None old_time = None youtube_id = None if event_type == VIDEO_PLAYED: code = event_data.get('code') if code not in VIDEO_CODES: youtube_id = code current_time = self._check_time_offset(event_data.get('currentTime'), line) if current_time is None: # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Play', 1) return # Slow: self.incr_counter(self.counter_category_name, 'Subset Play', 1) elif event_type == VIDEO_PAUSED: # Pause events may have a missing currentTime value if video is paused at the beginning, # so provide a default of zero. current_time = self._check_time_offset(event_data.get('currentTime', 0), line) if current_time is None: # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Pause', 1) return # Slow: self.incr_counter(self.counter_category_name, 'Subset Pause', 1) elif event_type == VIDEO_SEEK: current_time = self._check_time_offset(event_data.get('new_time'), line) old_time = self._check_time_offset(event_data.get('old_time'), line) if current_time is None or old_time is None: # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Seek', 1) return # Slow: self.incr_counter(self.counter_category_name, 'Subset Seek', 1) elif event_type == VIDEO_STOPPED: current_time = self._check_time_offset(event_data.get('currentTime'), line) if current_time is None: # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1) # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Stop', 1) return # Slow: self.incr_counter(self.counter_category_name, 'Subset Stop', 1) if youtube_id is not None: youtube_id = youtube_id.encode('utf8') # self.incr_counter(self.counter_category_name, 'Output Video Events from Mapper', 1) yield ( (user_id, course_id.encode('utf8'), encoded_module_id.encode('utf8')), (timestamp, event_type, current_time, old_time, youtube_id, video_duration) )
def test_get_bad_string_event_data(self): item = {"event": "a string but not JSON"} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.error.call_count, 1) self.assertIn('unparsable event value', self.mock_log.error.call_args[0][0])
def test_get_dict_event_data(self): item = {"event": {"a dict": "that has strings"}} self.assertEquals(eventlog.get_event_data(item), {"a dict": "that has strings"}) self.assertEquals(self.mock_log.error.call_count, 0) self.assertEquals(self.mock_log.debug.call_count, 0)
def test_missing_event_data(self): item = {"something else": "not an event"} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.error.call_count, 1) self.assertIn('missing event value', self.mock_log.error.call_args[0][0])
def test_get_key_value_string_event_data(self): item = {"event": "a=3,b=4"} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.debug.call_count, 1) self.assertIn('key-value pairs', self.mock_log.debug.call_args[0][0])
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, date_string = value username = event.get('username', '').strip() if not username: return event_type = event.get('event_type') if event_type is None: return course_id = eventlog.get_course_id(event) if not course_id: return event_data = eventlog.get_event_data(event) if event_data is None: return event_source = event.get('event_source') entity_id = '' info = {} forum_post_voted = None if event_type == 'problem_check': if event_source != 'server': return problem_id = event_data.get('problem_id') if not problem_id: return entity_id = problem_id if event_data.get('success', 'incorrect').lower() == 'correct': info['correct'] = True elif event_type == 'play_video': encoded_module_id = event_data.get('id') if not encoded_module_id: return entity_id = encoded_module_id elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type): timestamp = eventlog.get_event_time_string(event) if timestamp is None: return info['path'] = event_type info['timestamp'] = timestamp event_type = SUBSECTION_VIEWED_MARKER elif event_type.startswith('edx.forum'): forum_post_voted = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type) if forum_post_voted: info['vote_value'] = event_data.get('vote_value') if info['vote_value'] not in ['up', 'down']: return info['undo_vote'] = event_data.get('undo_vote', False) date_grouping_key = date_string if self.interval_type == 'weekly': last_complete_date = self.interval.date_b - datetime.timedelta(days=1) # pylint: disable=no-member last_weekday = last_complete_date.isoweekday() split_date = date_string.split('-') event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2])) event_weekday = event_date.isoweekday() days_until_end = last_weekday - event_weekday if days_until_end < 0: days_until_end += 7 end_of_week_date = event_date + datetime.timedelta(days=days_until_end) date_grouping_key = end_of_week_date.isoformat() elif self.interval_type == 'all': # If gathering all data for a given user, use the last complete day of the interval # for joining with enrollment. last_complete_date = self.interval.date_b - datetime.timedelta(days=1) # pylint: disable=no-member date_grouping_key = last_complete_date.isoformat() yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string)) if forum_post_voted: # We emit two events for each "voted" event - one for the voting user and one for the # user receiving the vote. username = event_data.get('target_username') if not username: return event_type = 'edx.forum.{}.vote_received'.format(forum_post_voted.group('post_type')) yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def test_missing_event_data(self): item = {"something else": "not an event"} self.assertIsNone(eventlog.get_event_data(item))
def test_get_dict_event_data(self): item = {"event": {"a dict": "that has strings"}} self.assertEquals(eventlog.get_event_data(item), {"a dict": "that has strings"})
def test_event_data_with_unknown_type(self): item = {"event": ["a list", "of strings"]} self.assertIsNone(eventlog.get_event_data(item))
def test_get_json_string_event_data(self): item = {"event": '{ "a string": "that is JSON"}'} self.assertEquals(eventlog.get_event_data(item), {"a string": "that is JSON"})
def test_get_empty_string_event_data(self): item = {"event": ''} self.assertEquals(eventlog.get_event_data(item), {})
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, date_string = value username = event.get('username', '').strip() if not username: return event_type = event.get('event_type') if event_type is None: return course_id = eventlog.get_course_id(event) if not course_id: return event_data = eventlog.get_event_data(event) if event_data is None: return event_source = event.get('event_source') entity_id = '' info = {} if event_type == 'problem_check': if event_source != 'server': return problem_id = event_data.get('problem_id') if not problem_id: return entity_id = problem_id if event_data.get('success', 'incorrect').lower() == 'correct': info['correct'] = True elif event_type == 'play_video': encoded_module_id = event_data.get('id') if not encoded_module_id: return entity_id = encoded_module_id elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type): timestamp = eventlog.get_event_time_string(event) if timestamp is None: return info['path'] = event_type info['timestamp'] = timestamp event_type = SUBSECTION_VIEWED_MARKER date_grouping_key = date_string if self.interval_type == 'weekly': last_complete_date = self.interval.date_b - datetime.timedelta(days=1) # pylint: disable=no-member last_weekday = last_complete_date.isoweekday() split_date = date_string.split('-') event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2])) event_weekday = event_date.isoweekday() days_until_end = last_weekday - event_weekday if days_until_end < 0: days_until_end += 7 end_of_week_date = event_date + datetime.timedelta(days=days_until_end) date_grouping_key = end_of_week_date.isoformat() elif self.interval_type == 'all': # If gathering all data for a given user, use the last complete day of the interval # for joining with enrollment. last_complete_date = self.interval.date_b - datetime.timedelta(days=1) # pylint: disable=no-member date_grouping_key = last_complete_date.isoformat() yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def test_event_data_with_unknown_type(self): item = {"event": 1} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.error.call_count, 1) self.assertIn('unrecognized type', self.mock_log.error.call_args[0][0])
def test_get_json_string_event_data(self): item = {"event": '{ "a string": "that is JSON"}'} self.assertEquals(eventlog.get_event_data(item), {"a string": "that is JSON"}) self.assertEquals(self.mock_log.error.call_count, 0) self.assertEquals(self.mock_log.debug.call_count, 0)
def test_get_json_string_event_data(self): item = {"event": '{ "a string": "that is JSON"}'} self.assertEquals(eventlog.get_event_data(item), {"a string": "that is JSON"}) self.assertEquals(self.mock_log.error.call_count, 0) self.assertEquals(self.mock_log.debug.call_count, 0)
def mapper(self, line): # Add a filter here to permit quicker rejection of unrelated events. if VIDEO_EVENT_MINIMUM_STRING not in line: return value = self.get_event_and_date_string(line) if value is None: return event, _date_string = value event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return if event_type not in VIDEO_EVENT_TYPES: return # This has already been checked when getting the event, so just fetch the value. timestamp = eventlog.get_event_time_string(event) # Strip username to remove trailing newlines that mess up Luigi. username = event.get('username', '').strip() if not username: log.error("Video event without username: %s", event) return course_id = eventlog.get_course_id(event) if course_id is None: log.warn('Video event without valid course_id: {0}'.format(line)) return event_data = eventlog.get_event_data(event) if event_data is None: # This should already have been logged. return encoded_module_id = event_data.get('id') if encoded_module_id is None: log.warn('Video event without valid encoded_module_id (id): {0}'.format(line)) return current_time = None old_time = None youtube_id = None if event_type == VIDEO_PLAYED: code = event_data.get('code') if code not in ('html5', 'mobile'): youtube_id = code current_time = self._check_time_offset(event_data.get('currentTime'), line) if current_time is None: return elif event_type == VIDEO_PAUSED: # Pause events may have a missing currentTime value if video is paused at the beginning, # so provide a default of zero. current_time = self._check_time_offset(event_data.get('currentTime', 0), line) if current_time is None: return elif event_type == VIDEO_SEEK: current_time = self._check_time_offset(event_data.get('new_time'), line) old_time = self._check_time_offset(event_data.get('old_time'), line) if current_time is None or old_time is None: return elif event_type == VIDEO_STOPPED: current_time = self._check_time_offset(event_data.get('currentTime'), line) if current_time is None: return if youtube_id is not None: youtube_id = youtube_id.encode('utf8') yield ( (username.encode('utf8'), course_id.encode('utf8'), encoded_module_id.encode('utf8')), (timestamp, event_type, current_time, old_time, youtube_id) )
def test_event_data_with_list_type(self): item = {"event": ["a list", "of strings"]} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.debug.call_count, 1) self.assertIn('value of type list', self.mock_log.debug.call_args[0][0])
def test_get_empty_string_event_data(self): item = {"event": ''} self.assertEquals(eventlog.get_event_data(item), {}) self.assertEquals(self.mock_log.error.call_count, 0) self.assertEquals(self.mock_log.debug.call_count, 0)
def test_event_data_with_unknown_type(self): item = {"event": 1} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.error.call_count, 1) self.assertIn('unrecognized type', self.mock_log.error.call_args[0][0])
def mapper(self, line): value = self.get_event_and_date_string(line) if value is None: return event, date_string = value username = event.get('username', '').strip() if not username: return event_type = event.get('event_type') if event_type is None: return course_id = eventlog.get_course_id(event) if not course_id: return event_data = eventlog.get_event_data(event) if event_data is None: return event_source = event.get('event_source') entity_id = '' info = {} if event_type == 'problem_check': if event_source != 'server': return problem_id = event_data.get('problem_id') if not problem_id: return entity_id = problem_id if event_data.get('success', 'incorrect').lower() == 'correct': info['correct'] = True elif event_type == 'play_video': encoded_module_id = event_data.get('id') if not encoded_module_id: return entity_id = encoded_module_id elif event_type[:9] == '/courses/' and re.match( self.SUBSECTION_ACCESSED_PATTERN, event_type): timestamp = eventlog.get_event_time_string(event) if timestamp is None: return info['path'] = event_type info['timestamp'] = timestamp event_type = SUBSECTION_VIEWED_MARKER date_grouping_key = date_string if self.interval_type == 'weekly': last_complete_date = self.interval.date_b - datetime.timedelta( days=1) # pylint: disable=no-member last_weekday = last_complete_date.isoweekday() split_date = date_string.split('-') event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2])) event_weekday = event_date.isoweekday() days_until_end = last_weekday - event_weekday if days_until_end < 0: days_until_end += 7 end_of_week_date = event_date + datetime.timedelta( days=days_until_end) date_grouping_key = end_of_week_date.isoformat() elif self.interval_type == 'all': # If gathering all data for a given user, use the last complete day of the interval # for joining with enrollment. last_complete_date = self.interval.date_b - datetime.timedelta( days=1) # pylint: disable=no-member date_grouping_key = last_complete_date.isoformat() yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def get_explicit_enrollment_output(line): """ Generates output values for explicit enrollment events. Args: line: text line from a tracking event log. Returns: (course_id, user_id), (timestamp, action_value) where action_value = 1 (enrolled) or -1 (unenrolled) and timestamp is in ISO format, with resolution to the millisecond. or None if there is no valid enrollment event on the line. Example: (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1) """ # Before parsing, check that the line contains something that # suggests it's an enrollment event. if 'edx.course.enrollment' not in line: return None # try to parse the line into a dict: event = eventlog.parse_json_event(line) if event is None: # The line didn't parse. For this specific purpose, # we can assume that all enrollment-related lines would parse, # and these non-parsing lines would get skipped anyway. return None # get event type, and check that it exists: event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return None # convert the type to a value: if event_type == 'edx.course.enrollment.activated': action_value = ENROLLED elif event_type == 'edx.course.enrollment.deactivated': action_value = UNENROLLED else: # not an enrollment event... return None # get the timestamp: datetime = eventlog.get_event_time(event) if datetime is None: log.error("encountered event with bad datetime: %s", event) return None timestamp = eventlog.datetime_to_timestamp(datetime) # Use the `user_id` from the event `data` field, since the # `user_id` in the `context` field is the user who made the # request but not necessarily the one who got enrolled. (The # `course_id` should be the same in `context` as in `data`.) # Get the event data: event_data = eventlog.get_event_data(event) if event_data is None: # Assume it's already logged (and with more specifics). return None # Get the course_id from the data, and validate. course_id = event_data['course_id'] if not opaque_key_util.is_valid_course_id(course_id): log.error("encountered explicit enrollment event with bogus course_id: %s", event) return None # Get the user_id from the data: user_id = event_data.get('user_id') if user_id is None: log.error("encountered explicit enrollment event with no user_id: %s", event) return None # For now, ignore the enrollment 'mode' (e.g. 'honor'). return (course_id, user_id), (timestamp, action_value)
def get_explicit_enrollment_output(line): """ Generates output values for explicit enrollment events. Args: line: text line from a tracking event log. Returns: (course_id, user_id), (timestamp, action_value) where action_value = 1 (enrolled) or -1 (unenrolled) and timestamp is in ISO format, with resolution to the millisecond. or None if there is no valid enrollment event on the line. Example: (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1) """ # Before parsing, check that the line contains something that # suggests it's an enrollment event. if 'edx.course.enrollment' not in line: return None # try to parse the line into a dict: event = eventlog.parse_json_event(line) if event is None: # The line didn't parse. For this specific purpose, # we can assume that all enrollment-related lines would parse, # and these non-parsing lines would get skipped anyway. return None # get event type, and check that it exists: event_type = event.get('event_type') if event_type is None: log.error("encountered event with no event_type: %s", event) return None # convert the type to a value: if event_type == 'edx.course.enrollment.activated': action_value = ENROLLED elif event_type == 'edx.course.enrollment.deactivated': action_value = UNENROLLED else: # not an enrollment event... return None # get the timestamp: datetime = eventlog.get_event_time(event) if datetime is None: log.error("encountered event with bad datetime: %s", event) return None timestamp = eventlog.datetime_to_timestamp(datetime) # Use the `user_id` from the event `data` field, since the # `user_id` in the `context` field is the user who made the # request but not necessarily the one who got enrolled. (The # `course_id` should be the same in `context` as in `data`.) # Get the event data: event_data = eventlog.get_event_data(event) if event_data is None: # Assume it's already logged (and with more specifics). return None # Get the course_id from the data, and validate. course_id = event_data['course_id'] if not opaque_key_util.is_valid_course_id(course_id): log.error( "encountered explicit enrollment event with bogus course_id: %s", event) return None # Get the user_id from the data: user_id = event_data.get('user_id') if user_id is None: log.error("encountered explicit enrollment event with no user_id: %s", event) return None # For now, ignore the enrollment 'mode' (e.g. 'honor'). return (course_id, user_id), (timestamp, action_value)
def test_get_dict_event_data(self): item = {"event": {"a dict": "that has strings"}} self.assertEquals(eventlog.get_event_data(item), {"a dict": "that has strings"}) self.assertEquals(self.mock_log.error.call_count, 0) self.assertEquals(self.mock_log.debug.call_count, 0)
def test_missing_event_data(self): item = {"something else": "not an event"} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.error.call_count, 1) self.assertIn('missing event value', self.mock_log.error.call_args[0][0])
def test_get_bad_string_event_data(self): item = {"event": "a string but not JSON"} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.error.call_count, 1) self.assertIn('unparsable event value', self.mock_log.error.call_args[0][0])
def test_get_empty_string_event_data(self): item = {"event": ''} self.assertEquals(eventlog.get_event_data(item), {}) self.assertEquals(self.mock_log.error.call_count, 0) self.assertEquals(self.mock_log.debug.call_count, 0)
def test_get_key_value_string_event_data(self): item = {"event": "a=3,b=4"} self.assertIsNone(eventlog.get_event_data(item)) self.assertEquals(self.mock_log.debug.call_count, 1) self.assertIn('key-value pairs', self.mock_log.debug.call_args[0][0])
def test_get_bad_string_event_data(self): item = {"event": "a string but not JSON"} self.assertIsNone(eventlog.get_event_data(item))