def handle(self, raw_file):
    self.log("Reading " + raw_file)
    try:
        record_count = 0
        bytes_read = 0
        start = now()
        file_version = fileutil.detect_file_version(raw_file,
                                                    simple_detection=True)
        self.log("Detected version {0} for file {1}".format(file_version,
                                                            raw_file))
        for unpacked in fileutil.unpack(raw_file, file_version=file_version):
            record_count += 1
            common_bytes = unpacked.len_path + \
                           fileutil.RECORD_PREAMBLE_LENGTH[file_version]
            current_bytes = common_bytes + unpacked.len_data
            current_bytes_uncompressed = common_bytes + len(unpacked.data)
            bytes_read += current_bytes
            if unpacked.error:
                self.log("ERROR: Found corrupted data for record {0} in " \
                         "{1} path: {2} Error: {3}".format(record_count,
                             raw_file, unpacked.path, unpacked.error))
                self.stats.increment(records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1, bad_record_type="corrupted_data")
                continue
            if len(unpacked.data) == 0:
                self.log("WARN: Found empty data for record {0} in " \
                         "{1} path: {2}".format(record_count, raw_file,
                             unpacked.path))
                self.stats.increment(records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1, bad_record_type="empty_data")
                continue

            submission_date = ts_to_yyyymmdd(unpacked.timestamp)
            path = fileutil.to_unicode(unpacked.path)
            if unpacked.data[0] != "{":
                # Data looks weird, should be JSON.
                self.log("Warning: Found unexpected data for record {0}" \
                         " in {1} path: {2} data:\n{3}".format(record_count,
                             raw_file, path, unpacked.data))
            else:
                # Raw JSON, make sure we treat it as unicode.
                unpacked.data = fileutil.to_unicode(unpacked.data)

            path_components = path.split("/")
            if len(path_components) != self.expected_dim_count:
                # We're going to pop the ID off, but we'll also add the
                # submission date, so it evens out.
                bad_record_type = "invalid_path"
                if ReadRawStep.UUID_ONLY_PATH.match(path):
                    bad_record_type = "uuid_only_path"
                else:
                    self.log("Found an invalid path in record {0}: " \
                             "{1}".format(record_count, path))
                self.stats.increment(records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1, bad_record_type=bad_record_type)
                continue

            key = path_components.pop(0)
            info = {}
            info["reason"] = path_components.pop(0)
            info["appName"] = path_components.pop(0)
            info["appVersion"] = path_components.pop(0)
            info["appUpdateChannel"] = path_components.pop(0)
            info["appBuildID"] = path_components.pop(0)
            dims = self.schema.dimensions_from(info, submission_date)
            channel = self.schema.get_field(dims, "appUpdateChannel", True,
                                            True)
            self.stats.increment(channel=channel, records_read=1,
                    bytes_read=current_bytes,
                    bytes_uncompressed=current_bytes_uncompressed)
            try:
                # Convert data:
                if self.converter is None:
                    serialized_data = unpacked.data
                    # TODO: Converter.VERSION_UNCONVERTED
                    data_version = 1
                else:
                    parsed_data, parsed_dims = self.converter.convert_json(
                            unpacked.data, dims[-1], unpacked.ip)
                    # TODO: take this out if it's too slow
                    for i in range(len(dims)):
                        if dims[i] != parsed_dims[i]:
                            self.log("Record {0} mismatched dimension " \
                                     "{1}: '{2}' != '{3}'".format(
                                         record_count, i, dims[i],
                                         parsed_dims[i]))
                    serialized_data = self.converter.serialize(parsed_data)
                    dims = parsed_dims
                    # TODO: Converter.VERSION_CONVERTED
                    data_version = 2
                try:
                    # Write to persistent storage
                    n = self.storage.write(key, serialized_data, dims,
                                           data_version)
                    self.stats.increment(channel=channel, records_written=1,
                            bytes_written=len(key) + len(serialized_data) + 2)
                    # Compress rotated files as we generate them
                    if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX):
                        self.q_out.put(n)
                except Exception, e:
                    self.write_bad_record(key, dims, serialized_data, str(e),
                            "ERROR Writing to output file:", "write_failed")
            except BadPayloadError, e:
                self.write_bad_record(key, dims, unpacked.data, e.msg,
                        "Bad Payload:", "bad_payload")
            except Exception, e:
                err_message = str(e)
                if err_message == "Missing in payload: info.revision":
                    # We don't need to write these bad records out - we know
                    # why they are being skipped.
                    self.stats.increment(channel=channel, bad_records=1,
                            bad_record_type="missing_revision")
                elif err_message == "Invalid revision URL: /rev/":
                    # We do want to log these payloads, but we don't want
                    # the full stack trace.
                    self.write_bad_record(key, dims, unpacked.data,
                            err_message, "Conversion Error",
                            "missing_revision_repo")
                # Don't split this long string - we want to be able to find
                # it in the code
                elif err_message.startswith(
                        "JSONDecodeError: Invalid control character"):
                    self.write_bad_record(key, dims, unpacked.data,
                            err_message, "Conversion Error",
                            "invalid_control_char")
                else:
                    # TODO: Recognize other common failure modes and handle
                    # them gracefully.
                    self.write_bad_record(key, dims, unpacked.data,
                            err_message, "Conversion Error",
                            "conversion_error")
                    self.log(traceback.format_exc())
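
# Note on helpers used above: ts_to_yyyymmdd and fileutil.to_unicode are
# assumed to be provided elsewhere in this repo. Judging from the standalone
# script below, which converts millisecond timestamps to POSIX seconds and
# formats them as YYYYMMDD, ts_to_yyyymmdd is assumed to behave roughly like:
#
#     def ts_to_yyyymmdd(timestamp_ms):
#         return date.fromtimestamp(timestamp_ms / 1000).strftime("%Y%m%d")
#
# The submission path is likewise expected to split into exactly
# self.expected_dim_count components, popped in the order key, reason,
# appName, appVersion, appUpdateChannel, appBuildID. A hypothetical example
# (values made up for illustration) would look like:
#
#     "<document-id>/saved-session/Firefox/28.0/nightly/20140101030203"
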
def main():
    parser = argparse.ArgumentParser(
            description='Split raw logs into partitioned files.',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
            help="Rotate output files after N bytes", type=int,
            default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from",
            required=True)
    parser.add_argument("-o", "--output-dir",
            help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema",
            help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-f", "--file-version",
            help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()

    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)

    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp / 1000).strftime(
                "%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")
        bytes_read += r.len_ip + r.len_path + r.len_data + \
                fileutil.RECORD_PREAMBLE_LENGTH[file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."
        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print " Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
            mb_read, duration, mb_read / duration, bad_record_count,
            record_count)
    return 0
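
# Minimal entry-point sketch, assuming this module is intended to be run
# directly as a script. The import is placed here only to keep the sketch
# self-contained; in the full file it would sit with the other imports.
import sys

if __name__ == "__main__":
    sys.exit(main())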