def obfuscate_wiki_entry(self, line, user_profile):
    """
    Obfuscate the free-text columns of one tab-separated wiki revision line.

    The line is stripped of its trailing newline characters, decoded as UTF-8
    and split on tabs; the columns are used to build an
    `ArticleRevisionRecord`.  The record's `content` and `user_message`
    values are backslash-decoded, run through `self.obfuscator.obfuscate_text`
    and backslash-encoded again, then written back to columns 12 and 2
    respectively.

    `user_profile` may be None.  Otherwise it is queried with `.get(user_id)`
    and, when an entry is found, its `name` is handed to the obfuscator.

    Returns the re-joined line as UTF-8 encoded bytes (no trailing newline).
    """
    columns = line.rstrip('\r\n').decode('utf8').split('\t')
    revision = ArticleRevisionRecord(*columns)

    # Collect whatever is known about the author so the obfuscator can use it.
    user_info = {}
    if user_profile is not None and revision.user_id != 'NULL':
        profile = user_profile.get(revision.user_id)
        if profile is not None:
            user_info['name'] = [profile.name]
        else:
            log.error("Missing profile entry for user_id %s", revision.user_id)

    # These columns are only reported on, never modified here.  The second
    # literal in each tuple is presumably the header-row value -- TODO confirm.
    if revision.ip_address not in ('NULL', 'ip_address'):
        log.warning("Found non-NULL IP address")
    if revision.automatic_log not in ('', 'automatic_log'):
        log.warning(u"Found non-zero-length automatic_log: %s", revision.automatic_log)

    def scrub(encoded_text):
        """Decode, obfuscate and re-encode a single column value."""
        decoded_text = backslash_decode_value(encoded_text)
        return backslash_encode_value(
            self.obfuscator.obfuscate_text(decoded_text, user_info))

    # The record itself cannot be modified, so patch the column list instead.
    columns[12] = scrub(revision.content)
    columns[2] = scrub(revision.user_message)

    return u"\t".join(columns).encode('utf-8')
def filter_row(self, row):
    """
    Scrub one wiki revision row in place and return it.

    Column 5 holds the user id (or the literal 'NULL').  When an id is
    present, it is looked up in `self.user_by_id` to collect the 'username'
    and 'name' values handed to the obfuscator, and the column is replaced
    with `self.remap_id(user_id)`.  Columns 2, 3 and 4 are blanked, and
    column 12 is decoded, obfuscated and re-encoded.
    """
    has_user = row[5] != 'NULL'
    user_info = {}
    numeric_id = None

    if has_user:
        numeric_id = int(row[5])
        user_info['user_id'] = [numeric_id]
        try:
            entry = self.user_by_id[numeric_id]
            for key in ('username', 'name'):
                if key in entry:
                    user_info[key] = [entry[key]]
        except KeyError:
            log.error("Unable to find wiki user_id: %s in the user_by_id map", numeric_id)

    # Blank out user_message (2), automatic_log (3) and ip_address (4).
    for column_index in (2, 3, 4):
        row[column_index] = ''

    # A 'NULL' user_id is left exactly as it was; real ids are remapped.
    if has_user:
        row[5] = self.remap_id(numeric_id)

    decoded_content = backslash_decode_value(row[12].decode('utf8'))
    obfuscated_content = self.obfuscator.obfuscate_text(decoded_content, user_info)
    row[12] = backslash_encode_value(obfuscated_content).encode('utf8')

    return row
def filter_row(self, row):
    """
    Scrub one wiki revision row in place and return it.

    Column 5 holds the user id (or the literal 'NULL').  When an id is
    present, it is looked up in `self.user_by_id` to collect the 'username'
    and 'name' values handed to the obfuscator, and the column is replaced
    with `self.remap_id(user_id)`.  Columns 2, 3 and 4 are blanked, and
    column 12 is decoded, obfuscated and re-encoded.
    """
    has_user = row[5] != 'NULL'
    user_info = {}
    numeric_id = None

    if has_user:
        numeric_id = int(row[5])
        user_info['user_id'] = [numeric_id]
        try:
            entry = self.user_by_id[numeric_id]
            for key in ('username', 'name'):
                if key in entry:
                    user_info[key] = [entry[key]]
        except KeyError:
            log.error("Unable to find wiki user_id: %s in the user_by_id map", numeric_id)

    # Blank out user_message (2), automatic_log (3) and ip_address (4).
    for column_index in (2, 3, 4):
        row[column_index] = ''

    # A 'NULL' user_id is left exactly as it was; real ids are remapped.
    if has_user:
        row[5] = self.remap_id(numeric_id)

    decoded_content = backslash_decode_value(row[12].decode('utf8'))
    obfuscated_content = self.obfuscator.obfuscate_text(decoded_content, user_info)
    row[12] = backslash_encode_value(obfuscated_content).encode('utf8')

    return row
def get_raw_event(self, event_line):
    """
    Return a backslash-encoded, key-sorted JSON dump of the event on `event_line`.

    The line is parsed with `eventlog.parse_json_event`.  If
    `eventlog.get_event_data` yields a value for the parsed event, that value
    replaces the event's 'event' entry before the dump is made.
    """
    parsed_event = eventlog.parse_json_event(event_line)

    payload = eventlog.get_event_data(parsed_event)
    if payload is not None:
        parsed_event['event'] = payload

    # Sorted keys make the serialized form stable for a given event.
    return backslash_encode_value(json.dumps(parsed_event, sort_keys=True))
def get_raw_event(self, event_line):
    """
    Return a backslash-encoded, key-sorted JSON dump of the event on `event_line`.

    The line is parsed with `eventlog.parse_json_event`.  If
    `eventlog.get_event_data` yields a value for the parsed event, that value
    replaces the event's 'event' entry before the dump is made.
    """
    parsed_event = eventlog.parse_json_event(event_line)

    payload = eventlog.get_event_data(parsed_event)
    if payload is not None:
        parsed_event['event'] = payload

    # Sorted keys make the serialized form stable for a given event.
    return backslash_encode_value(json.dumps(parsed_event, sort_keys=True))
def obfuscate_wiki_entry(self, line, user_profile):
    """
    Obfuscate the free-text columns of one tab-separated wiki revision line.

    The line is stripped of its trailing newline characters, decoded as UTF-8
    and split on tabs; the columns are used to build an
    `ArticleRevisionRecord`.  The record's `content` and `user_message`
    values are backslash-decoded, run through `self.obfuscator.obfuscate_text`
    and backslash-encoded again, then written back to columns 12 and 2
    respectively.

    `user_profile` may be None.  Otherwise it is queried with `.get(user_id)`
    and, when an entry is found, its `name` is handed to the obfuscator.

    Returns the re-joined line as UTF-8 encoded bytes (no trailing newline).
    """
    columns = line.rstrip('\r\n').decode('utf8').split('\t')
    revision = ArticleRevisionRecord(*columns)

    # Collect whatever is known about the author so the obfuscator can use it.
    user_info = {}
    if user_profile is not None and revision.user_id != 'NULL':
        profile = user_profile.get(revision.user_id)
        if profile is not None:
            user_info['name'] = [profile.name]
        else:
            log.error("Missing profile entry for user_id %s", revision.user_id)

    # These columns are only reported on, never modified here.  The second
    # literal in each tuple is presumably the header-row value -- TODO confirm.
    if revision.ip_address not in ('NULL', 'ip_address'):
        log.warning("Found non-NULL IP address")
    if revision.automatic_log not in ('', 'automatic_log'):
        log.warning(u"Found non-zero-length automatic_log: %s", revision.automatic_log)

    def scrub(encoded_text):
        """Decode, obfuscate and re-encode a single column value."""
        decoded_text = backslash_decode_value(encoded_text)
        return backslash_encode_value(
            self.obfuscator.obfuscate_text(decoded_text, user_info))

    # The record itself cannot be modified, so patch the column list instead.
    columns[12] = scrub(revision.content)
    columns[2] = scrub(revision.user_message)

    return u"\t".join(columns).encode('utf-8')
def mapper(self, line):
    """
    Emit one count per usable event found on `line`.

    Yields `((event_date, event_category, event_type, event_source, exported), 1)`.
    Nothing is yielded when the line cannot be turned into an event/date pair,
    when the event type, event source or date is None, or when the event type
    starts with a slash.

    `event_category` comes from `self.known_events`, keyed by
    `(event_source, event_type)`; events not found there are categorized as
    'unknown' and flagged as not exported.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, event_date = parsed

    event_type = event.get('event_type')
    event_source = event.get('event_source')

    # Skip the event if any part of the key is missing.
    if any(part is None for part in (event_source, event_type, event_date)):
        return

    # Skip events whose type begins with a slash.
    if event_type.startswith('/'):
        return

    lookup_key = (event_source, event_type)
    exported = lookup_key in self.known_events
    event_category = self.known_events[lookup_key] if exported else 'unknown'

    # Encode only now, after the None check and the lookup on the raw value,
    # so that embedded newlines and the like cannot end up in the output key.
    encoded_type = backslash_encode_value(unicode(event_type))

    yield (event_date, event_category, encoded_type, event_source, exported), 1
def mapper(self, line):
    """
    Emit one count per usable event found on `line`.

    Yields `((event_date, event_category, event_type, event_source, exported), 1)`.
    Nothing is yielded when the line cannot be turned into an event/date pair,
    when the event type, event source or date is None, or when the event type
    starts with a slash.

    `event_category` comes from `self.known_events`, keyed by
    `(event_source, event_type)`; events not found there are categorized as
    'unknown' and flagged as not exported.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, event_date = parsed

    event_type = event.get('event_type')
    event_source = event.get('event_source')

    # Skip the event if any part of the key is missing.
    if any(part is None for part in (event_source, event_type, event_date)):
        return

    # Skip events whose type begins with a slash.
    if event_type.startswith('/'):
        return

    lookup_key = (event_source, event_type)
    exported = lookup_key in self.known_events
    event_category = self.known_events[lookup_key] if exported else 'unknown'

    # Encode only now, after the None check and the lookup on the raw value,
    # so that embedded newlines and the like cannot end up in the output key.
    encoded_type = backslash_encode_value(unicode(event_type))

    yield (event_date, event_category, encoded_type, event_source, exported), 1
def get_raw_event(self, event_line):
    """
    Return a backslash-encoded, key-sorted JSON dump of the event on `event_line`.

    The line is parsed with `eventlog.parse_json_event` and the result is
    serialized again without further changes.
    """
    parsed_event = eventlog.parse_json_event(event_line)
    # Sorted keys make the serialized form stable for a given event.
    return backslash_encode_value(json.dumps(parsed_event, sort_keys=True))
def test_encoding(self, text, expected_result):
    """Check that backslash-encoding `text` produces `expected_result`."""
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest releases no longer provide.
    self.assertEqual(obfuscate_util.backslash_encode_value(text), expected_result)
def test_encoding_round_trip(self, text):
    """Check that backslash-decoding the encoded form of `text` returns `text`."""
    encoded = obfuscate_util.backslash_encode_value(text)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest releases no longer provide.
    self.assertEqual(text, obfuscate_util.backslash_decode_value(encoded))
def _add_entry(self, record_dict, record_key, record_field, label, obj):
    """
    Add the `obj` to the `record_key` entry of `record_dict`, performing
    appropriate conversion based on `record_field`.

    For strings, the entry is truncated, if necessary, so we should
    rarely see truncation errors.  Also, null characters are escaped
    so that they won't fail when being loaded into BigQuery.

    For timestamps, parsing is done using ciso8601.  Timezone-aware results
    are converted to UTC, and the result is validated with
    `self.date_time_field_for_validating`; a value that fails parsing or
    validation is stored as None.

    Errors are logged, but are not fatal.  For integer, boolean and float
    fields the value is simply not set in such cases.  (It's only fatal,
    then, if the value was required.)

    Arguments:
        record_dict: mapping that receives the converted value.
        record_key: key under which the value is stored.
        record_field: field definition that selects the conversion.
        label: name used for the value in most log messages.
        obj: raw value to convert.
    """
    if isinstance(record_field, StringField):
        if obj is None:
            # TODO: this should really check to see if the record_field is nullable.
            value = None
        else:
            value = backslash_encode_value(unicode(obj))
            if '\x00' in value:
                value = value.replace('\x00', '\\0')
            # Avoid validation errors later due to length by truncating here.
            field_length = record_field.length
            value_length = len(value)
            if value_length > field_length:
                log.error(
                    "Record value length (%d) exceeds max length (%d) for field %s: %r",
                    value_length, field_length, record_key, value
                )
                if field_length >= 4:
                    value = u"{}...".format(value[:field_length - 4])
                else:
                    # There is no room for the ellipsis marker, and a negative
                    # slice bound would keep nearly the whole value, so cut
                    # hard at the limit instead.
                    value = value[:max(field_length, 0)]
        record_dict[record_key] = value
    elif isinstance(record_field, IntegerField):
        try:
            record_dict[record_key] = int(obj)
        except (TypeError, ValueError, OverflowError):
            # TypeError covers None and other non-numeric objects;
            # OverflowError covers infinite floats.
            log.error('Unable to cast value to int for %s: %r', label, obj)
    elif isinstance(record_field, BooleanField):
        try:
            record_dict[record_key] = bool(obj)
        except ValueError:
            log.error('Unable to cast value to bool for %s: %r', label, obj)
    elif isinstance(record_field, FloatField):
        try:
            record_dict[record_key] = float(obj)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects.
            log.error('Unable to cast value to float for %s: %r', label, obj)
    elif isinstance(record_field, DateTimeField):
        datetime_obj = None
        if obj is not None:
            try:
                datetime_obj = ciso8601.parse_datetime(obj)
                if datetime_obj is None:
                    # NOTE(review): older ciso8601 releases reportedly return
                    # None rather than raising on unparseable input -- confirm
                    # against the pinned version.  Treat it as a parse error.
                    raise ValueError('unparseable datetime value')
                if datetime_obj.tzinfo:
                    datetime_obj = datetime_obj.astimezone(pytz.utc)
            except (TypeError, ValueError):
                # TypeError covers values that are not strings at all.
                log.error('Unable to cast value to datetime for %s: %r', label, obj)

        # Because it's not enough just to create a datetime object, also perform
        # validation here.
        if datetime_obj is not None:
            validation_errors = self.date_time_field_for_validating.validate(datetime_obj)
            if validation_errors:
                log.error(
                    'Invalid assigment of value %r to field "%s": %s',
                    datetime_obj, label, ', '.join(validation_errors)
                )
                datetime_obj = None

        record_dict[record_key] = datetime_obj
    else:
        # Any other field type is stored without conversion.
        record_dict[record_key] = obj
def test_encoding(self, text, expected_result):
    """Check that backslash-encoding `text` produces `expected_result`."""
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest releases no longer provide.
    self.assertEqual(obfuscate_util.backslash_encode_value(text), expected_result)
def test_encoding_round_trip(self, text):
    """Check that backslash-decoding the encoded form of `text` returns `text`."""
    encoded = obfuscate_util.backslash_encode_value(text)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest releases no longer provide.
    self.assertEqual(text, obfuscate_util.backslash_decode_value(encoded))
def get_raw_event(self, event_line):
    """
    Return a backslash-encoded, key-sorted JSON dump of the event on `event_line`.

    The line is parsed with `eventlog.parse_json_event` and the result is
    serialized again without further changes.
    """
    parsed_event = eventlog.parse_json_event(event_line)
    # Sorted keys make the serialized form stable for a given event.
    return backslash_encode_value(json.dumps(parsed_event, sort_keys=True))