def main():
    parser = ArgumentParser(description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)

    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)
    print "found", len(ping_files), "pings"

    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
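# The get_pings() helper called above is not part of this excerpt. A minimal
# sketch of what it is assumed to do (list the saved ping files in a
# directory); the real implementation may differ:
import os

def get_pings(ping_dir):
    # Hypothetical stand-in: return the filenames found in ping_dir.
    if not os.path.isdir(ping_dir):
        return []
    return [f for f in os.listdir(ping_dir)
            if os.path.isfile(os.path.join(ping_dir, f))]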
"allowed_values": ["c1", "c2", "c3"] }, { "field_name": "appVersion", "allowed_values": "*" }, { "field_name": "appBuildID", "allowed_values": "*" }, { "field_name": "submission_date", "allowed_values": { "min": "20130101", "max": "20131231" } } ] } try: schema = TelemetrySchema(schema_spec) storage = StorageLayout(schema, test_dir, 10000) test_file_1 = os.path.join(test_dir, "test.log") storage.write_filename("foo", '{"bar": "baz"}', test_file_1) test_file_1_md5, test_file_1_size = fileutil.md5file(test_file_1) assert test_file_1_md5 == "206dd2d33a04802c31d2c74f10cc472b" assert storage.clean_newlines("ab\n\ncd\r\n") == "ab cd " finally: shutil.rmtree(test_dir)
def main():
    signal.signal(signal.SIGINT, handle_sigint)
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c", "--config", required=True, type=file,
                        help="AWS Configuration file (json)")
    parser.add_argument("-w", "--work-dir", required=True,
                        help="Location to cache downloaded files")
    parser.add_argument("-o", "--output-dir", required=True,
                        help="Base dir to store processed data")
    parser.add_argument("-i", "--input-files", type=file,
                        help="File containing a list of keys to process")
    parser.add_argument("-b", "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-l", "--log-file", help="Log output to this file")
    parser.add_argument("-s", "--stats-file", help="Log statistics to this file")
    parser.add_argument("--histogram-cache-path", default="./histogram_cache",
                        help="Path to store a local cache of histograms")
    parser.add_argument("-t", "--telemetry-schema", required=True,
                        help="Location of the desired telemetry schema")
    parser.add_argument("-m", "--max-output-size", metavar="N", type=int,
                        default=500000000,
                        help="Rotate output files after N bytes")
    parser.add_argument("-D", "--dry-run", action="store_true",
                        help="Don't modify remote files")
    parser.add_argument("-n", "--no-clean", action="store_true",
                        help="Don't clean out the output-dir before beginning")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print more detailed output")
    args = parser.parse_args()

    if args.verbose:
        # Turn on mp logging
        multiprocessing.log_to_stderr(logging.DEBUG)

    config = json.load(args.config)
    # TODO: allow commandline args to override config values.

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    logger = Log(args.log_file, "Master")
    num_cpus = multiprocessing.cpu_count()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    s3downloader = None
    raw_readers = None
    compressors = None
    exporters = None
    done = False

    if args.no_clean:
        logger.log("Not removing log files in {}".format(args.output_dir))
    else:
        # Remove existing log files from output_dir (to clean up after an
        # incomplete previous run, for example).
        logger.log("Removing log files in {}".format(args.output_dir))
        for root, dirs, files in os.walk(args.output_dir):
            for f in files:
                if f.endswith(".log"):
                    full = os.path.join(root, f)
                    if args.dry_run:
                        logger.log("Would be deleting {}, except it's a "
                                   "dry run".format(full))
                    else:
                        try:
                            logger.log("Removing existing file: " + full)
                            os.remove(full)
                        except Exception, e:
                            logger.log("Error removing existing "
                                       "file {}: {}".format(full, e))
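# handle_sigint() is registered as the SIGINT handler above but defined
# elsewhere. A minimal sketch, assuming the intent is simply to exit promptly
# on Ctrl-C; the real handler may instead set a shared "stop" flag for the
# worker processes:
import sys

def handle_sigint(signum, frame):
    # Hypothetical handler: abort the run when SIGINT is received.
    print "Received SIGINT, exiting."
    sys.exit(1)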
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key",
                        required=True)
    parser.add_argument("-r", "--aws-region", help="AWS Region",
                        default="us-west-2")
    parser.add_argument("-w", "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base dir to store processed data", required=True)
    parser.add_argument("-i", "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-b", "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-q", "--queue",
                        help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c", "--histogram-cache-path",
                        help="Path to store a local cache of histograms",
                        default="./histogram_cache")
    parser.add_argument("-t", "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes", type=int,
                        default=500000000)
    parser.add_argument("-D", "--dry-run",
                        help="Don't modify remote files", action="store_true")
    parser.add_argument("-C", "--skip-conversion",
                        help="Skip validation/conversion of payloads",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(
                args.aws_region,
                aws_access_key_id=args.aws_key,
                aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print " ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found. Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)
    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [os.path.join(args.work_dir, f) for f in incoming_filenames]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(num_cpus, "Reader", ReadRawStep, raw_files,
                                (completed_files, schema, converter, storage,
                                 args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files,))

    # Export compressed files to S3.
    exporters = start_workers(num_cpus, "Exporter", ExportCompressedStep,
                              compressed_files,
                              (args.output_dir, args.aws_key,
                               args.aws_secret_key, args.publish_bucket,
                               args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # `find <out_dir> -type f -not -name ".compressme"`
    # Add them to completed_files
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)
    wait_for(compressors, "Compressors")

    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)
    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print " Dry run, so not really deleting", f
        else:
            print " Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print " Dry run, so not really deleting", m.get_body()
            else:
                print " Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print " Message deleted successfully"
                else:
                    print " Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (
        duration, timer.delta_sec(after_download))
    return 0
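# timer.delta_sec() is used above to report elapsed time but comes from a
# helper module that is not shown here. A plausible sketch of its behaviour
# (an assumption, not the module's actual code):
from datetime import datetime

def delta_sec(start, end=None):
    # Seconds elapsed between `start` and `end` (defaulting to now).
    if end is None:
        end = datetime.now()
    delta = end - start
    return delta.days * 86400 + delta.seconds + delta.microseconds / 1e6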
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes", type=int,
                        default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from",
                        required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t", "--telemetry-schema",
                        help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-f", "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()

    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")
        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print " Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
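# In the loop above, fileutil.unpack() is expected to yield record objects
# exposing the fields accessed on `r`. A hypothetical illustration of that
# shape (the real fileutil record type is not shown in this excerpt):
from collections import namedtuple

UnpackedRecord = namedtuple("UnpackedRecord",
                            ["len_ip", "len_path", "len_data",
                             "timestamp", "path", "data", "error"])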
class TestPersist(unittest.TestCase):
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        return {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["r1", "r2"]
                }, {
                    "field_name": "appName",
                    "allowed_values": ["a1"]
                }, {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["c1", "c2", "c3"]
                }, {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                }, {
                    "field_name": "appBuildID",
                    "allowed_values": "*"
                }, {
                    "field_name": "submission_date",
                    "allowed_values": {"min": "20130101", "max": "20131231"}
                }
            ]
        }

    def test_write_filename(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEquals(test_file, test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")
        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"), "ab cd ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for i in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEquals(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
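# The expected path in test_write() above implies how get_filename() maps a
# list of dimension values onto the on-disk layout: all but the last two
# values become directories, and the last two are joined with dots plus a
# schema-version suffix and ".log". A rough re-derivation of that mapping
# (an inference from the test expectation, not the TelemetrySchema code):
def sketch_filename(base_dir, dims, version=1):
    # ["r1", "a1", "c1", "v1", "b1", "20130102"] ->
    #     <base_dir>/r1/a1/c1/v1/b1.20130102.v1.log
    leaf = "%s.%s.v%d.log" % (dims[-2], dims[-1], version)
    return "/".join([base_dir] + dims[:-2] + [leaf])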
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes", type=int,
                        default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from",
                        required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t", "--telemetry-schema",
                        help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()

    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")
        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print " Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0