def mapper(self, line):
    """Emit ``(username, (timestamp, ip_address))`` for one tracking-log line.

    Nothing is emitted when the parser returns None, the event has no
    username, no event time can be extracted, the event time is at or after
    ``self.end_datetime``, or the event carries no ``ip`` value.
    """
    parsed = eventlog.parse_json_event(line)
    if parsed is None:
        return

    username = parsed.get('username')
    if not username:
        return

    # Normalize the username, but leave a trace of the anomaly in the logs.
    trimmed = username.strip()
    if trimmed != username:
        log.error("User '%s' has extra whitespace, which is being stripped. Event: %s", username, parsed)
        username = trimmed

    # Drop events with no usable time, or outside the (exclusive) upper bound.
    event_time = eventlog.get_event_time(parsed)
    if event_time is None or event_time >= self.end_datetime:
        return
    timestamp = eventlog.datetime_to_timestamp(event_time)

    ip_address = parsed.get('ip')
    if not ip_address:
        log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
        return

    yield username, (timestamp, ip_address)
def get_raw_event(self, event_line):
    """Return the event on ``event_line`` as backslash-encoded, key-sorted JSON.

    When ``eventlog.get_event_data`` yields something other than None, that
    value replaces the event's ``'event'`` entry before serialization.
    """
    parsed = eventlog.parse_json_event(event_line)
    payload = eventlog.get_event_data(parsed)
    if payload is not None:
        parsed['event'] = payload
    # Sorted keys keep the serialized form stable across runs.
    return backslash_encode_value(json.dumps(parsed, sort_keys=True))
def mapper(self, line):
    """Emit ``(date, raw line)`` for each log line whose event passes the filter.

    The key is the date portion (text before the ``T``) of the event's
    ``'time'`` value, UTF-8 encoded; the value is the original line with any
    trailing CR/LF removed.

    Lines that do not parse, or whose event has no ``'time'`` value, are
    skipped instead of raising, so a single malformed line cannot abort the
    whole map task.
    """
    event = eventlog.parse_json_event(line)
    if event is None:
        # The parser returns None for lines it cannot decode (e.g. truncated ones).
        return

    event_time = event.get('time')
    if not event_time:
        # Without a time there is no date to key the output on.
        return

    if self._filter_event(event) is None:
        return

    # Rely on the time's textual layout rather than parsing it: everything
    # before the "T" separator is the date.
    date_string = event_time.split("T")[0]
    yield date_string.encode('utf-8'), line.rstrip('\r\n')
def get_raw_events_from_log_file(self, input_file):
    """Collect every parseable event with a usable time from ``input_file``.

    Overrides the parent class implementation in order to disable the event
    filter. Each returned event has its ``'timestamp'`` entry set to the value
    produced by ``eventlog.get_event_time``.
    """
    collected = []
    for raw_line in input_file:
        parsed = eventlog.parse_json_event(raw_line)
        # Only ask for the event time when the line actually produced an event.
        event_time = eventlog.get_event_time(parsed) if parsed else None
        if event_time:
            parsed['timestamp'] = event_time
            collected.append(parsed)
    return collected
def mapper(self, line):
    """Emit ``(date, encoded event)`` for each event that survives processing.

    The event is passed through ``self.filter_event`` and then
    ``self.deidentify_event``; if either returns None nothing is emitted.
    The key is the date portion (text before the ``T``) of the event's
    ``'time'`` value, UTF-8 encoded; the value is the deidentified event
    serialized with ``cjson``.

    Lines that do not parse, or whose event has no ``'time'`` value, are
    skipped instead of raising, so a single malformed line cannot abort the
    whole map task.
    """
    event = eventlog.parse_json_event(line)
    if event is None:
        # The parser returns None for lines it cannot decode (e.g. truncated ones).
        return

    # Capture the time before filtering/deidentifying touches the event.
    event_time = event.get('time')
    if not event_time:
        return

    filtered_event = self.filter_event(event)
    if filtered_event is None:
        return

    deidentified_event = self.deidentify_event(filtered_event)
    if deidentified_event is None:
        return

    date_string = event_time.split("T")[0]
    yield date_string.encode('utf-8'), cjson.encode(deidentified_event)
def get_event_and_date_string(self, line):
    """Parse ``line`` and return ``(event, date_string)`` when it is in range.

    Returns None when the line does not parse, when ``self.get_event_time``
    yields nothing usable, or when the event's date is before
    ``self.lower_bound_date_string`` or on/after
    ``self.upper_bound_date_string``.
    """
    parsed = eventlog.parse_json_event(line)
    if parsed is None:
        return None

    raw_time = self.get_event_time(parsed)
    if not raw_time:
        return None

    # strptime is deliberately avoided because it is extremely slow. The
    # timestamp is ISO8601 formatted, so the date part looks like %Y-%m-%d
    # (for example 2014-05-20) and plain string comparison orders dates
    # correctly.
    day = raw_time.split("T")[0]
    in_window = day >= self.lower_bound_date_string and day < self.upper_bound_date_string
    if not in_window:
        return None

    return parsed, day
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value = 1 (enrolled) or -1 (unenrolled)
        and timestamp is in ISO format, with resolution to the millisecond.

      or None if there is no valid enrollment event on the line. That covers
      lines that do not mention enrollment, do not parse, have a missing or
      non-enrollment event_type, a bad time, no event data, a missing or
      invalid course_id, or no user_id.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Cheap substring test first, so that the vast majority of lines are
    # rejected without paying for JSON parsing.
    if 'edx.course.enrollment' not in line:
        return None

    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # Convert the event type to an action value.
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # Not an enrollment event...
        return None

    # Named event_time (rather than datetime) so the stdlib module name is
    # not shadowed inside this function.
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  A missing course_id is
    # reported the same way as an invalid one instead of raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)
def test_parse_json_event_with_nonascii(self):
    """The parser returns a dict and the username compares equal to the expected unicode value."""
    # NOTE(review): the username in this fixture looks redacted ("******"),
    # while the assertion expects a value containing U+FFFD -- confirm the
    # fixture against the upstream test before relying on it.
    line = '{"username": "******"}'
    result = eventlog.parse_json_event(line)
    self.assertIsInstance(result, dict)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(result['username'], u'b\ufffdb')
def test_parse_json_event_with_cruft(self):
    """Text surrounding the JSON object does not stop the parser from returning a dict."""
    line = 'leading cruft here {"username": "******"}  '
    result = eventlog.parse_json_event(line)
    # assertIsInstance reports the offending value and type on failure.
    self.assertIsInstance(result, dict)
def test_parse_json_event_truncated(self):
    """A JSON object that is cut off mid-string parses to None."""
    truncated = '{"username": "unsuccessful'
    self.assertIsNone(eventlog.parse_json_event(truncated))
def test_parse_valid_json_event(self):
    """A well-formed JSON object parses to a dict."""
    line = '{"username": "******"}'
    result = eventlog.parse_json_event(line)
    # assertIsInstance reports the offending value and type on failure.
    self.assertIsInstance(result, dict)
def obfuscate_event_entry(self, line):
    """Obfuscate the ``'event'`` payload of one log line and return it as JSON.

    Returns:
        * ``line`` unchanged when it fails to parse or no course_id can be
          found for it (both are logged as errors);
        * None when the payload contains ``'POST'`` and
          ``self.parameters['skip_post']`` is set;
        * otherwise the event re-serialized with ``json.dumps``, with its
          ``'event'`` entry replaced whenever the obfuscator returned an
          updated structure.
    """
    event = eventlog.parse_json_event(line)
    if event is None:
        # Unexpected here...
        log.error(u"Encountered event entry which failed to parse: %r", line)
        return line

    course_id = eventlog.get_course_id(event, from_url=True)
    if course_id is None:
        # Unexpected here...
        log.error(u"Encountered event entry with no course_id: %r", line)
        return line

    # eventlog.get_event_data(event) cannot be used as-is here: we need to
    # know what was done to the payload, so that it can be transformed back
    # to its original form once cleaned.
    event_json_decoded = False
    event_data = event.get('event')

    if event_data is None:
        log.error(u"Encountered event entry with no 'event' payload: %r", line)
    elif event_data == '':
        # Note that this happens with some browser events.  Instead of
        # failing to parse it as a JSON string, just leave as-is.
        pass
    elif isinstance(event_data, basestring):
        # Cjson produces str, while json produces unicode.  Hmm.
        if len(event_data) == 512 and 'POST' in event_data:
            # It's a truncated JSON string.  But we're going to throw it out anyway, so no worries.
            pass
        elif '{' not in event_data and '=' in event_data:
            # It's a key-value pair from a browser event.  Just process as-is, rather than parsing and reassembling.
            pass
        else:
            try:
                event_data = eventlog.decode_json(event_data)
                event_json_decoded = True
            except Exception:
                # Best effort: keep the raw string and carry on.
                log.error(u"Encountered event entry with unparseable 'event' payload: %r", line)

    # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
    # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
    # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
    # to get to strings that can be properly interpreted.
    event_user_info = self.get_userinfo_from_event(event, event_data)

    # A missing payload was already logged above; guard the membership test
    # so that it does not raise TypeError on None.
    # NOTE(review): the helpers around this check still receive None in that
    # case -- confirm that they tolerate it.
    if event_data is not None and 'POST' in event_data:
        if self.parameters['skip_post']:
            return None

    updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", event_user_info)

    if updated_event_data is not None:
        event_source = event.get('event_source')
        event_type = event.get('event_type')
        log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

        if event_json_decoded:
            # The payload arrived as a JSON string, so hand it back as one.
            # TODO: should really use cjson, if that were originally used for decoding the json.
            updated_event_data = json.dumps(updated_event_data)

        event['event'] = updated_event_data

    # TODO: should really use cjson, if that were originally used for decoding the json.
    return json.dumps(event)
def parse_event_from_entity(self, line):
    """Parse ``line`` with ``eventlog.parse_json_event`` and return its result unchanged."""
    parsed_event = eventlog.parse_json_event(line)
    return parsed_event
def get_raw_event(self, event_line):
    """Return the event on ``event_line`` as backslash-encoded, key-sorted JSON."""
    parsed = eventlog.parse_json_event(event_line)
    # Sorted keys keep the serialized form stable across runs.
    return backslash_encode_value(json.dumps(parsed, sort_keys=True))
def obfuscate_event_line(self, line):
    """Return the obfuscated form of an event line.

    The line is parsed, run through ``self._obfuscate_event``, encoded back
    to JSON, and stripped of surrounding whitespace.
    """
    obfuscated = self._obfuscate_event(eventlog.parse_json_event(line))
    encoded = eventlog.encode_json(obfuscated)
    return encoded.strip()
def obfuscate_event_entry(self, line):
    """Obfuscate the ``'event'`` payload of one log line and return it as JSON.

    Returns:
        * ``line`` unchanged when it fails to parse or no course_id can be
          found for it (both are logged as errors);
        * None when the payload contains ``'POST'`` and
          ``self.parameters['skip_post']`` is set;
        * otherwise the event re-serialized with ``json.dumps``, with its
          ``'event'`` entry replaced whenever the obfuscator returned an
          updated structure.
    """
    event = eventlog.parse_json_event(line)
    if event is None:
        # Unexpected here...
        log.error(u"Encountered event entry which failed to parse: %r", line)
        return line

    course_id = eventlog.get_course_id(event, from_url=True)
    if course_id is None:
        # Unexpected here...
        log.error(u"Encountered event entry with no course_id: %r", line)
        return line

    # eventlog.get_event_data(event) cannot be used as-is here: we need to
    # know what was done to the payload, so that it can be transformed back
    # to its original form once cleaned.
    event_json_decoded = False
    event_data = event.get('event')

    if event_data is None:
        log.error(u"Encountered event entry with no 'event' payload: %r", line)
    elif event_data == '':
        # Note that this happens with some browser events.  Instead of
        # failing to parse it as a JSON string, just leave as-is.
        pass
    elif isinstance(event_data, basestring):
        # Cjson produces str, while json produces unicode.  Hmm.
        if len(event_data) == 512 and 'POST' in event_data:
            # It's a truncated JSON string.  But we're going to throw it out anyway, so no worries.
            pass
        elif '{' not in event_data and '=' in event_data:
            # It's a key-value pair from a browser event.  Just process as-is, rather than parsing and reassembling.
            pass
        else:
            try:
                event_data = eventlog.decode_json(event_data)
                event_json_decoded = True
            except Exception:
                # Best effort: keep the raw string and carry on.
                log.error(
                    u"Encountered event entry with unparseable 'event' payload: %r", line)

    # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
    # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
    # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
    # to get to strings that can be properly interpreted.
    event_user_info = self.get_userinfo_from_event(event, event_data)

    # A missing payload was already logged above; guard the membership test
    # so that it does not raise TypeError on None.
    # NOTE(review): the helpers around this check still receive None in that
    # case -- confirm that they tolerate it.
    if event_data is not None and 'POST' in event_data:
        if self.parameters['skip_post']:
            return None

    updated_event_data = self.obfuscator.obfuscate_structure(
        event_data, u"event", event_user_info)

    if updated_event_data is not None:
        event_source = event.get('event_source')
        event_type = event.get('event_type')
        log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

        if event_json_decoded:
            # The payload arrived as a JSON string, so hand it back as one.
            # TODO: should really use cjson, if that were originally used for decoding the json.
            updated_event_data = json.dumps(updated_event_data)

        event['event'] = updated_event_data

    # TODO: should really use cjson, if that were originally used for decoding the json.
    return json.dumps(event)
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value = 1 (enrolled) or -1 (unenrolled)
        and timestamp is in ISO format, with resolution to the millisecond.

      or None if there is no valid enrollment event on the line. That covers
      lines that do not mention enrollment, do not parse, have a missing or
      non-enrollment event_type, a bad time, no event data, a missing or
      invalid course_id, or no user_id.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Cheap substring test first, so that the vast majority of lines are
    # rejected without paying for JSON parsing.
    if 'edx.course.enrollment' not in line:
        return None

    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # Convert the event type to an action value.
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # Not an enrollment event...
        return None

    # Named event_time (rather than datetime) so the stdlib module name is
    # not shadowed inside this function.
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  A missing course_id is
    # reported the same way as an invalid one instead of raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error(
            "encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)