def main():
    """Convert locally saved Telemetry pings to the server storage structure.

    Command-line options:
      --input-dir            directory to look for pings in (required).  If
                             get_pings() finds nothing there, the
                             "saved-telemetry-pings" subdirectory is tried,
                             for the case where a profile dir was given.
      --output-dir           base directory handed to StorageLayout (required).
      --schema               path of the telemetry schema JSON file.
      --histogram-cache-dir  directory handed to RevisionCache.

    Each ping is converted, serialized and written through StorageLayout.  A
    ping whose conversion or write fails is reported (message on stdout or
    traceback on stderr) and skipped; the loop then continues with the next
    ping.
    """
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    # Taken as a plain path (not an argparse-opened handle) so the file can
    # be closed deterministically below.
    parser.add_argument("--schema", default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print("Getting pings from %s converting them and storing them in %s"
          % (args.input_dir, args.output_dir))

    with open(args.schema) as schema_file:
        schema = TelemetrySchema(json.load(schema_file))

    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    # NOTE(review): the third argument is presumably the maximum output file
    # size in bytes -- confirm against StorageLayout.
    storage = StorageLayout(schema, args.output_dir, 500000000)

    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if not ping_files:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print("found %s pings" % (len(ping_files),))

    for ping_file in ping_files:
        # Keep the ping file open only for as long as it takes to parse it.
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)

        key = ping['slug']
        payload = ping['payload']
        # Today's date is used as the submission date for every ping.
        submission_date = date.today().strftime("%Y%m%d")
        dims = schema.dimensions_from(payload, submission_date)

        try:
            # convert_obj is given the last dimension value and returns the
            # dimensions that are actually used for storage.
            parsed_data, dims = converter.convert_obj(payload, dims[-1])
            serialized_data = converter.serialize(parsed_data)
            data_version = Converter.VERSION_CONVERTED
            try:
                # Write to persistent storage
                n = storage.write(key, serialized_data, dims, data_version)
                print("Successfully saved ping %s to %s" % (key, n))
            except Exception:
                traceback.print_exc()
        except BadPayloadError as e:
            print("Bad Payload: %s" % (e.msg,))
        except Exception:
            traceback.print_exc()
def main():
    """Convert locally saved Telemetry pings to the server storage structure.

    Command-line options:
      --input-dir            directory to look for pings in (required).  If
                             get_pings() finds nothing there, the
                             "saved-telemetry-pings" subdirectory is tried,
                             for the case where a profile dir was given.
      --output-dir           base directory handed to StorageLayout (required).
      --schema               path of the telemetry schema JSON file.
      --histogram-cache-dir  directory handed to RevisionCache.

    Each ping is converted, serialized and written through StorageLayout.  A
    ping whose conversion or write fails is reported (message on stdout or
    traceback on stderr) and skipped; the loop then continues with the next
    ping.
    """
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    # Taken as a plain path (not an argparse-opened handle) so the file can
    # be closed deterministically below.
    parser.add_argument("--schema", default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print("Getting pings from %s converting them and storing them in %s"
          % (args.input_dir, args.output_dir))

    with open(args.schema) as schema_file:
        schema = TelemetrySchema(json.load(schema_file))

    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    # NOTE(review): the third argument is presumably the maximum output file
    # size in bytes -- confirm against StorageLayout.
    storage = StorageLayout(schema, args.output_dir, 500000000)

    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if not ping_files:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print("found %s pings" % (len(ping_files),))

    for ping_file in ping_files:
        # Keep the ping file open only for as long as it takes to parse it.
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)

        key = ping['slug']
        payload = ping['payload']
        # Today's date is used as the submission date for every ping.
        submission_date = date.today().strftime("%Y%m%d")
        dims = schema.dimensions_from(payload, submission_date)

        try:
            # convert_obj is given the last dimension value and returns the
            # dimensions that are actually used for storage.
            parsed_data, dims = converter.convert_obj(payload, dims[-1])
            serialized_data = converter.serialize(parsed_data)
            data_version = Converter.VERSION_CONVERTED
            try:
                # Write to persistent storage
                n = storage.write(key, serialized_data, dims, data_version)
                print("Successfully saved ping %s to %s" % (key, n))
            except Exception:
                traceback.print_exc()
        except BadPayloadError as e:
            print("Bad Payload: %s" % (e.msg,))
        except Exception:
            traceback.print_exc()
def main():
    """Split a raw log file into files partitioned by the schema dimensions.

    Command-line options:
      -m/--max-output-size   rotate output files after N bytes.
      -i/--input-file        raw log file to read (required).
      -o/--output-dir        base directory for the split files (required,
                             created if missing).
      -t/--telemetry-schema  telemetry schema spec file (required).
      -f/--file-version      log file version; detected from the input file
                             via fileutil.detect_file_version() when omitted.

    Records flagged with an error, and records whose path does not split
    into the expected number of components, are counted as bad and skipped.
    A one-line read/throughput summary is printed at the end.

    Returns:
        0 once the whole input has been processed.
    """
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes",
                        type=int, default=500000000)
    parser.add_argument("-i", "--input-file",
                        help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t", "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f", "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    # 'with' closes the schema file even when json.load() raises.
    with open(args.telemetry_schema) as schema_data:
        schema = TelemetrySchema(json.load(schema_data))

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)

    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue

        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(
            r.timestamp / 1000).strftime("%Y%m%d")

        # Deal with unicode: undecodable bytes are replaced, not fatal.
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += (r.len_ip + r.len_path + r.len_data +
                       fileutil.RECORD_PREAMBLE_LENGTH[file_version])

        path_components = path.split("/")
        # The path carries the record ID but no submission date, while the
        # schema has a submission date but no ID -- so the component count
        # must equal the dimension count.
        if len(path_components) != expected_dim_count:
            print("Found an invalid path in record %s %s"
                  % (record_count, path))
            bad_record_count += 1
            continue

        # Path layout: <id>/<reason>/<appName>/<appVersion>/<channel>/<buildID>
        (key, reason, app_name, app_version,
         app_update_channel, app_build_id) = path_components[:6]
        info = {
            "reason": reason,
            "appName": app_name,
            "appVersion": app_version,
            "appUpdateChannel": app_update_channel,
            "appBuildID": app_build_id,
        }
        dimensions = schema.dimensions_from(info, submission_date)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    # Guard the throughput figure against a zero elapsed time.
    mb_per_sec = mb_read / duration if duration else 0.0
    print("Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_per_sec, bad_record_count, record_count))
    return 0
class TestPersist(unittest.TestCase):
    """Tests for StorageLayout writing, newline cleaning and file rotation.

    Every test runs against a scratch directory that setUp() creates and
    tearDown() removes again.
    """

    def setUp(self):
        """Build the schema and storage, then create the scratch directory."""
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        # 10000 is the size limit that test_rotate relies on.
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        # A leftover directory from an earlier run would skew the checksums.
        self.assertFalse(os.path.exists(test_dir),
                         "test dir already exists: " + test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        """Remove the scratch directory and everything written into it."""
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        """Return the scratch directory used by all tests."""
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        """Return the schema spec (six dimensions) the tests run against."""
        return {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["r1", "r2"]
                },
                {
                    "field_name": "appName",
                    "allowed_values": ["a1"]
                },
                {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["c1", "c2", "c3"]
                },
                {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appBuildID",
                    "allowed_values": "*"
                },
                {
                    "field_name": "submission_date",
                    "allowed_values": {
                        "min": "20130101",
                        "max": "20131231"
                    }
                }
            ]
        }

    def test_write_filename(self):
        """A JSON string and the equivalent object produce identical files."""
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, _size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, _size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        """write() stores the record at the filename the schema derives."""
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEqual(test_file,
                         test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, _size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        """Each newline / carriage return is replaced by a single space."""
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"),
                         "ab  cd  ")

    def test_rotate(self):
        """The write that reaches the size limit returns a rotated name."""
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for _ in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEqual(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(
            rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
def main():
    """Split a raw log file into files partitioned by the schema dimensions.

    Command-line options:
      -m/--max-output-size   rotate output files after N bytes.
      -i/--input-file        raw log file to read (required).
      -o/--output-dir        base directory for the split files (required,
                             created if missing).
      -t/--telemetry-schema  telemetry schema spec file (required).
      -b/--bucket, -k/--aws-key, -s/--aws-secret-key
                             accepted on the command line but not used by
                             this function.

    Records flagged with an error, and records whose path does not split
    into the expected number of components, are counted as bad and skipped.
    A one-line read/throughput summary is printed at the end.

    Returns:
        0 once the whole input has been processed.
    """
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-m", "--max-output-size", metavar="N",
        help="Rotate output files after N bytes", type=int, default=500000000
    )
    parser.add_argument("-i", "--input-file",
                        help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t", "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    # The S3 options are parsed for CLI compatibility only; nothing below
    # reads them.
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    # 'with' closes the schema file even when json.load() raises.
    with open(args.telemetry_schema) as schema_data:
        schema = TelemetrySchema(json.load(schema_data))

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()

    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(
            args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue

        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(
            timestamp / 1000).strftime("%Y%m%d")

        # Deal with unicode: undecodable bytes are replaced, not fatal.
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")

        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH

        path_components = path.split("/")
        # The path carries the record ID but no submission date, while the
        # schema has a submission date but no ID -- so the component count
        # must equal the dimension count.
        if len(path_components) != expected_dim_count:
            print("Found an invalid path in record %s %s"
                  % (record_count, path))
            bad_record_count += 1
            continue

        # Path layout: <id>/<reason>/<appName>/<appVersion>/<channel>/<buildID>
        (key, reason, app_name, app_version,
         app_update_channel, app_build_id) = path_components[:6]
        info = {
            "reason": reason,
            "appName": app_name,
            "appVersion": app_version,
            "appUpdateChannel": app_update_channel,
            "appBuildID": app_build_id,
        }
        dimensions = schema.dimensions_from(info, submission_date)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    # Guard the throughput figure against a zero elapsed time.
    mb_per_sec = mb_read / duration if duration else 0.0
    print("Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read,
        duration,
        mb_per_sec,
        bad_record_count,
        record_count,
    ))
    return 0
class TestPersist(unittest.TestCase):
    """Tests for StorageLayout writing, newline cleaning and file rotation.

    Every test runs against a scratch directory that setUp() creates and
    tearDown() removes again.
    """

    def setUp(self):
        """Build the schema and storage, then create the scratch directory."""
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        # 10000 is the size limit that test_rotate relies on.
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        # A leftover directory from an earlier run would skew the checksums.
        self.assertFalse(os.path.exists(test_dir),
                         "test dir already exists: " + test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        """Remove the scratch directory and everything written into it."""
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        """Return the scratch directory used by all tests."""
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        """Return the schema spec (six dimensions) the tests run against."""
        return {
            "version": 1,
            "dimensions": [{
                "field_name": "reason",
                "allowed_values": ["r1", "r2"]
            }, {
                "field_name": "appName",
                "allowed_values": ["a1"]
            }, {
                "field_name": "appUpdateChannel",
                "allowed_values": ["c1", "c2", "c3"]
            }, {
                "field_name": "appVersion",
                "allowed_values": "*"
            }, {
                "field_name": "appBuildID",
                "allowed_values": "*"
            }, {
                "field_name": "submission_date",
                "allowed_values": {
                    "min": "20130101",
                    "max": "20131231"
                }
            }]
        }

    def test_write_filename(self):
        """A JSON string and the equivalent object produce identical files."""
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, _size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, _size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        """write() stores the record at the filename the schema derives."""
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEqual(test_file,
                         test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, _size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        """Each newline / carriage return is replaced by a single space."""
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"),
                         "ab  cd  ")

    def test_rotate(self):
        """The write that reaches the size limit returns a rotated name."""
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for _ in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEqual(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(
            rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))