def mapper(self, line):
    """
    Emit the details of one event, keyed by the date on which it occurred.

    Yields `date_string, (timestamp, ip_address, course_id, username)`.

    Nothing is yielded when `get_event_and_date_string` returns None for the line,
    or when the event has no username, no time string, or no 'ip' value.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = eventlog.get_event_username(event)
    if not username:
        return

    # Carry the timestamp rather than only the date string, so that the latest
    # ip address can be chosen among events on the same day.
    timestamp = eventlog.get_event_time_string(event)
    if not timestamp:
        return

    ip_address = event.get('ip')
    if not ip_address:
        log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
        return

    # The course_id is taken from the event if it happens to be there; it is fine
    # for it to be absent.
    # NOTE: it is an open question whether only explicit events should contribute
    # a course_id (implicit events arguably are not "real" interactions with course
    # content).  One option is a flag marking explicit vs. implicit, so that the
    # latest explicit event for a course is preferred, then the latest implicit
    # one, then the latest event overall.
    course_id = eventlog.get_course_id(event)

    # With multi-output, a single file is generated per key value.  A file per
    # course per date is not wanted, so the date alone is the key and one file
    # represents all events on that date.
    record = (timestamp, ip_address, course_id, username)
    yield date_string, record
def mapper(self, line):
    """
    Emit `(start_date, end_date, username), 1` for the ISO week containing the event.

    `start_date` is the Monday of the event's ISO week and `end_date` is the day
    after that week's Sunday, both as ISO-formatted date strings.

    Nothing is yielded when `get_event_and_date_string` returns None for the line.
    An event without a username is logged, counted via `incr_counter`, and dropped.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = eventlog.get_event_username(event)
    if not username:
        log.error("Encountered event with no username: %s", event)
        self.incr_counter('Active Users last year', 'Discard Event Missing username', 1)
        return

    # date_string is split on '-' and its integer parts are passed positionally
    # to datetime.date.
    event_date = datetime.date(*map(int, date_string.split('-')))

    # Only the ISO year and week number are needed to locate the week.
    iso_year, iso_weekofyear = event_date.isocalendar()[:2]
    week = isoweek.Week(iso_year, iso_weekofyear)

    one_day = datetime.timedelta(days=1)
    start_date = week.monday().isoformat()
    end_date = (week.sunday() + one_day).isoformat()

    yield (start_date, end_date, username), 1
def test_event_username_with_trailing_whitespace(self):
    """A username followed by whitespace comes back from get_event_username stripped."""
    # NOTE(review): the input literal was masked as "******" in this copy of the
    # file.  That value has no trailing whitespace and cannot produce u'bub', so
    # an input matching the test's name and expectation is used here -- confirm
    # against the upstream test.
    item = {"username": "bub \n"}
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(eventlog.get_event_username(item), u'bub')
def test_empty_event_username(self):
    """An empty username is reported by get_event_username as None."""
    # NOTE(review): the input literal was masked as "******" in this copy of the
    # file.  That value is not empty, so the assertion below could not hold; an
    # empty string matching the test's name is used here -- confirm against the
    # upstream test.
    item = {"username": ""}
    self.assertIsNone(eventlog.get_event_username(item))
def test_missing_event_username(self):
    """A dict with no 'username' key yields None from get_event_username."""
    event_without_username = {"something else": "not an event"}
    result = eventlog.get_event_username(event_without_username)
    self.assertIsNone(result)
def _remap_user_info_in_event(self, event, event_data):
    """
    Collect the user info present in an event, remapping it in place as it is found.

    Three locations are examined, in this order:

    * the event's top-level username (via `eventlog.get_event_username`),
    * `event['context']`: its 'user_id' and, if present, its 'username',
    * the payload `event_data`: its 'user_id' and the 'username', 'instructor',
      'student' and 'user' keys.

    A username that `_remap_username` cannot map (it returns None) is logged and
    replaced with REDACTED_USERNAME.  Every user_id found is replaced with
    `remap_id(user_id)`.

    Returns a dict of iterables (a defaultdict of sets), with key values of
    'username', 'user_id', and 'name'.
    """
    log_suffix = self._get_log_string_for_event(event)

    # The same values may be added repeatedly from different parts of the event;
    # sets keep them deduped while still giving callers a dict of iterables.
    collected = defaultdict(set)

    # eventlog.get_event_username() already strips the username and rejects a
    # zero-length one, so neither is repeated here.
    username = eventlog.get_event_username(event)
    if username is not None:
        username = username.decode('utf8')
        replacement = self._remap_username(username, collected)
        if replacement is None:
            log.error("Redacting unrecognized username for '%s' field: '%s' %s", 'username', username, log_suffix)
            replacement = REDACTED_USERNAME
        event['username'] = replacement

    # user_id from context: an int or None.  Remapped whether or not it is known.
    context_user_id = self._get_user_id_as_int(event.get('context', {}).get('user_id'))
    if context_user_id is not None:
        collected['user_id'].add(context_user_id)
        known = self._get_user_info_for_user_id(context_user_id)
        if known is not None:
            for field, field_value in known.iteritems():
                collected[field].add(field_value)
            if username is not None and 'username' in known and username != known['username']:
                log.error(
                    u"user_id ('%s'=>'%s') does not match username ('%s') %s",
                    context_user_id, known['username'], username, log_suffix,
                )
        event['context']['user_id'] = self.remap_id(context_user_id)

    # username in context, if it is present.  (Removed in more recent events.)
    if 'context' in event:
        context = event['context']
        if 'username' in context:
            stripped = context['username'].strip()
            if stripped:
                context_username = stripped.decode('utf8')
                replacement = self._remap_username(context_username, collected)
                if replacement is None:
                    log.error("Redacting unrecognized username for '%s' field: '%s' %s",
                              'context.username', context_username, log_suffix)
                    replacement = REDACTED_USERNAME
                context['username'] = replacement

    # The event payload.
    if event_data:
        payload_user_id = self._get_user_id_as_int(event_data.get('user_id'))
        if payload_user_id is not None:
            collected['user_id'].add(payload_user_id)
            known = self._get_user_info_for_user_id(payload_user_id)
            if known is not None:
                for field, field_value in known.iteritems():
                    collected[field].add(field_value)
            event_data['user_id'] = self.remap_id(payload_user_id)

        # Usernames may appear in the payload under several different keys.
        # TODO: confirm that these values are usernames, not user_id values.
        # (User_id values will fail remapping.)
        for username_key in ['username', 'instructor', 'student', 'user']:
            if username_key not in event_data:
                continue
            stripped = event_data[username_key].strip()
            if not stripped:
                continue
            payload_username = stripped.decode('utf8')
            replacement = self._remap_username(payload_username, collected)
            if replacement is None:
                log.error("Redacting unrecognized username for 'event.%s' field: '%s' %s",
                          username_key, payload_username, log_suffix)
                replacement = REDACTED_USERNAME
            event_data[username_key] = replacement

    return collected
def get_userinfo_from_event(self, event, event_data):
    """
    Return the `self.user_info` entry that best identifies the user of an event.

    Up to three entries are looked up in `self.user_info` (when it is not None):
    one by the event's username, one by the user_id in `event['context']`, and
    one by the user_id in the payload `event_data` (only when the payload is a
    non-empty dict).  Unknown values and disagreements between the entries are
    logged but do not affect the result.

    Returns the payload user_id's entry if one was found, otherwise the context
    user_id's entry, otherwise the username's entry (which may be None).
    """
    # Start simply, and just get obvious info.  What is wanted out of this is a
    # user_id and/or a username that can be used for cleaning the rest of the
    # event, along with the relevant fullname -- hence entries are picked out of
    # user_info.  No prior analysis was made of how well the three sources
    # align, so the alignment is checked (and logged) here.
    event_type = event.get('event_type')
    if isinstance(event_type, str):
        event_type = event_type.decode('utf8')
    debug_str = u" [event_type='{}']".format(event_type)

    username_entry = None
    username = eventlog.get_event_username(event)
    if username is not None:
        username = username.decode('utf8')
        if self.user_info is not None:
            username_entry = self.user_info.get(username)
            if username_entry is None:
                log.error(u"username ('%s') is unknown to user_info %s", username, debug_str)

    # Get the user_id either as an int or None
    userid_entry = None
    user_id = self.get_user_id_as_int(event.get('context', {}).get('user_id'))
    if user_id is not None:
        if self.user_info is not None:
            userid_entry = self.user_info.get(user_id)
            if userid_entry is None:
                log.error(u"user_id ('%s') is unknown to user_info %s", user_id, debug_str)
            elif username_entry and userid_entry != username_entry:
                log.error(
                    u"user_id ('%s'='%s') does not match username ('%s'='%s') %s",
                    userid_entry.get('user_id'), userid_entry.get('username'),
                    username_entry.get('username'), username_entry.get('user_id'), debug_str,
                )

    event_userid_entry = None
    if event_data and isinstance(event_data, dict):
        event_user_id = self.get_user_id_as_int(event_data.get('user_id'))
        if event_user_id:
            if self.user_info is not None:
                event_userid_entry = self.user_info.get(event_user_id)
                if event_userid_entry is None:
                    log.error(u"Event_user_id ('%s') is unknown to user_info %s", event_user_id, debug_str)

            if user_id is None:
                # This is way too common.  In testing, every edx.course.enrollment.xxx had
                # the user_id in the event but not in context, so nothing is logged here.
                pass
            elif event_userid_entry and userid_entry != event_userid_entry:
                # This turns out to be somewhat expected for certain event types where one
                # user is doing something on behalf of another user.  The actor is in
                # context, and the object is in event payload.
                if event_type not in EVENT_TYPES_WITH_DIFFERENT_USERIDS:
                    # userid_entry is None when the context user_id was not found in
                    # user_info.  Fall back to the raw user_id so that reporting the
                    # mismatch cannot raise AttributeError on None.
                    if userid_entry is not None:
                        context_entry = userid_entry
                    else:
                        context_entry = {'user_id': user_id}
                    log.error(
                        u"Context user_id ('%s'='%s') does not match event user_id ('%s'='%s') %s",
                        context_entry.get('user_id'), context_entry.get('username'),
                        event_userid_entry.get('username'), event_userid_entry.get('user_id'), debug_str,
                    )
            elif event_user_id != user_id:
                log.error(u"Found user_id ('%s') in event that was different from context ('%s') %s",
                          event_user_id, user_id, debug_str)

    # We choose the event user_id over the context, and fall back on the username.
    if event_userid_entry is not None:
        return event_userid_entry
    elif userid_entry is not None:
        return userid_entry
    else:
        return username_entry
def _remap_user_info_in_event(self, event, event_data):
    """
    Collect the user info present in an event, remapping it in place as it is found.

    Three locations are examined, in this order:

    * the event's top-level username (via `eventlog.get_event_username`),
    * `event['context']`: its 'user_id' and, if present, its 'username',
    * the payload `event_data`: its 'user_id' and the 'username', 'instructor',
      'student' and 'user' keys.

    A username that `_remap_username` cannot map (it returns None) is logged and
    replaced with REDACTED_USERNAME.  Every user_id found is replaced with
    `remap_id(user_id)`.

    Returns a dict of iterables (a defaultdict of sets), with key values of
    'username', 'user_id', and 'name'.
    """
    log_suffix = self._get_log_string_for_event(event)

    # The same values may be added repeatedly from different parts of the event;
    # sets keep them deduped while still giving callers a dict of iterables.
    collected = defaultdict(set)

    # eventlog.get_event_username() already strips the username and rejects a
    # zero-length one, so neither is repeated here.
    username = eventlog.get_event_username(event)
    if username is not None:
        username = username.decode('utf8')
        replacement = self._remap_username(username, collected)
        if replacement is None:
            log.error("Redacting unrecognized username for '%s' field: '%s' %s", 'username', username, log_suffix)
            replacement = REDACTED_USERNAME
        event['username'] = replacement

    # user_id from context: an int or None.  Remapped whether or not it is known.
    context_user_id = self._get_user_id_as_int(event.get('context', {}).get('user_id'))
    if context_user_id is not None:
        collected['user_id'].add(context_user_id)
        known = self._get_user_info_for_user_id(context_user_id)
        if known is not None:
            for field, field_value in known.iteritems():
                collected[field].add(field_value)
            if username is not None and 'username' in known and username != known['username']:
                log.error(
                    u"user_id ('%s'=>'%s') does not match username ('%s') %s",
                    context_user_id, known['username'], username, log_suffix,
                )
        event['context']['user_id'] = self.remap_id(context_user_id)

    # username in context, if it is present.  (Removed in more recent events.)
    if 'context' in event:
        context = event['context']
        if 'username' in context:
            stripped = context['username'].strip()
            if stripped:
                context_username = stripped.decode('utf8')
                replacement = self._remap_username(context_username, collected)
                if replacement is None:
                    log.error("Redacting unrecognized username for '%s' field: '%s' %s",
                              'context.username', context_username, log_suffix)
                    replacement = REDACTED_USERNAME
                context['username'] = replacement

    # The event payload.
    if event_data:
        payload_user_id = self._get_user_id_as_int(event_data.get('user_id'))
        if payload_user_id is not None:
            collected['user_id'].add(payload_user_id)
            known = self._get_user_info_for_user_id(payload_user_id)
            if known is not None:
                for field, field_value in known.iteritems():
                    collected[field].add(field_value)
            event_data['user_id'] = self.remap_id(payload_user_id)

        # Usernames may appear in the payload under several different keys.
        # TODO: confirm that these values are usernames, not user_id values.
        # (User_id values will fail remapping.)
        for username_key in ['username', 'instructor', 'student', 'user']:
            if username_key not in event_data:
                continue
            stripped = event_data[username_key].strip()
            if not stripped:
                continue
            payload_username = stripped.decode('utf8')
            replacement = self._remap_username(payload_username, collected)
            if replacement is None:
                log.error("Redacting unrecognized username for 'event.%s' field: '%s' %s",
                          username_key, payload_username, log_suffix)
                replacement = REDACTED_USERNAME
            event_data[username_key] = replacement

    return collected
def get_userinfo_from_event(self, event, event_data):
    """
    Return the `self.user_info` entry that best identifies the user of an event.

    Up to three entries are looked up in `self.user_info` (when it is not None):
    one by the event's username, one by the user_id in `event['context']`, and
    one by the user_id in the payload `event_data` (only when the payload is a
    non-empty dict).  Unknown values and disagreements between the entries are
    logged but do not affect the result.

    Returns the payload user_id's entry if one was found, otherwise the context
    user_id's entry, otherwise the username's entry (which may be None).
    """
    # Start simply, and just get obvious info.  What is wanted out of this is a
    # user_id and/or a username that can be used for cleaning the rest of the
    # event, along with the relevant fullname -- hence entries are picked out of
    # user_info.  No prior analysis was made of how well the three sources
    # align, so the alignment is checked (and logged) here.
    event_type = event.get('event_type')
    if isinstance(event_type, str):
        event_type = event_type.decode('utf8')
    debug_str = u" [event_type='{}']".format(event_type)

    username_entry = None
    username = eventlog.get_event_username(event)
    if username is not None:
        username = username.decode('utf8')
        if self.user_info is not None:
            username_entry = self.user_info.get(username)
            if username_entry is None:
                log.error(u"username ('%s') is unknown to user_info %s", username, debug_str)

    # Get the user_id either as an int or None
    userid_entry = None
    user_id = self.get_user_id_as_int(event.get('context', {}).get('user_id'))
    if user_id is not None:
        if self.user_info is not None:
            userid_entry = self.user_info.get(user_id)
            if userid_entry is None:
                log.error(u"user_id ('%s') is unknown to user_info %s", user_id, debug_str)
            elif username_entry and userid_entry != username_entry:
                log.error(
                    u"user_id ('%s'='%s') does not match username ('%s'='%s') %s",
                    userid_entry.get('user_id'), userid_entry.get('username'),
                    username_entry.get('username'), username_entry.get('user_id'), debug_str,
                )

    event_userid_entry = None
    if event_data and isinstance(event_data, dict):
        event_user_id = self.get_user_id_as_int(event_data.get('user_id'))
        if event_user_id:
            if self.user_info is not None:
                event_userid_entry = self.user_info.get(event_user_id)
                if event_userid_entry is None:
                    log.error(u"Event_user_id ('%s') is unknown to user_info %s", event_user_id, debug_str)

            if user_id is None:
                # This is way too common.  In testing, every edx.course.enrollment.xxx had
                # the user_id in the event but not in context, so nothing is logged here.
                pass
            elif event_userid_entry and userid_entry != event_userid_entry:
                # This turns out to be somewhat expected for certain event types where one
                # user is doing something on behalf of another user.  The actor is in
                # context, and the object is in event payload.
                if event_type not in EVENT_TYPES_WITH_DIFFERENT_USERIDS:
                    # userid_entry is None when the context user_id was not found in
                    # user_info.  Fall back to the raw user_id so that reporting the
                    # mismatch cannot raise AttributeError on None.
                    if userid_entry is not None:
                        context_entry = userid_entry
                    else:
                        context_entry = {'user_id': user_id}
                    log.error(
                        u"Context user_id ('%s'='%s') does not match event user_id ('%s'='%s') %s",
                        context_entry.get('user_id'), context_entry.get('username'),
                        event_userid_entry.get('username'), event_userid_entry.get('user_id'), debug_str,
                    )
            elif event_user_id != user_id:
                log.error(u"Found user_id ('%s') in event that was different from context ('%s') %s",
                          event_user_id, user_id, debug_str)

    # We choose the event user_id over the context, and fall back on the username.
    if event_userid_entry is not None:
        return event_userid_entry
    elif userid_entry is not None:
        return userid_entry
    else:
        return username_entry