def main(argv=None):
    parser = argparse.ArgumentParser(description="Convert Telemetry data")
    parser.add_argument("-c", "--config-file",
                        help="Read configuration from this file",
                        default="./telemetry_server_config.json")
    parser.add_argument("-d", "--date",
                        help="Use specified date for dimensions")
    args = parser.parse_args()

    try:
        server_config = open(args.config_file, "r")
        config = json.load(server_config)
        server_config.close()
    except IOError:
        config = {}

    cache_dir = config.get("revision_cache_path", "./histogram_cache")
    server = config.get("revision_cache_server", "hg.mozilla.org")
    schema_filename = config.get("schema_filename", "./telemetry_schema.json")

    schema_data = open(schema_filename)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = revision_cache.RevisionCache(cache_dir, server)
    converter = Converter(cache, schema)
    process(converter, args.date)
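
# Illustrative sketch (not part of the original tool): build a minimal
# telemetry_server_config.json using only the keys that main() reads above.
# The key names come from the config.get() calls; the values shown are just
# the documented defaults.
import json

example_config = {
    "revision_cache_path": "./histogram_cache",
    "revision_cache_server": "hg.mozilla.org",
    "schema_filename": "./telemetry_schema.json",
}

with open("telemetry_server_config.json", "w") as f:
    json.dump(example_config, f, indent=4)
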
def __init__(self, config):
    # Sanity check args.
    if config.num_mappers <= 0:
        raise ValueError("Number of mappers must be greater than zero")
    if config.num_reducers <= 0:
        raise ValueError("Number of reducers must be greater than zero")
    if not os.path.isdir(config.data_dir):
        raise ValueError("Data dir must be a valid directory")
    if not os.path.isdir(config.work_dir):
        raise ValueError("Work dir must be a valid directory")
    if not os.path.isfile(config.job_script):
        raise ValueError("Job script must be a valid python file")
    if not os.path.isfile(config.input_filter):
        raise ValueError("Input filter must be a valid json file")

    self._input_dir = config.data_dir
    if self._input_dir[-1] == os.path.sep:
        self._input_dir = self._input_dir[0:-1]
    self._work_dir = config.work_dir
    self._input_filter = TelemetrySchema(json.load(open(config.input_filter)))
    self._allowed_values = self._input_filter.sanitize_allowed_values()
    self._output_file = config.output
    self._num_mappers = config.num_mappers
    self._num_reducers = config.num_reducers
    self._local_only = config.local_only
    self._bucket_name = config.bucket
    self._aws_key = config.aws_key
    self._aws_secret_key = config.aws_secret_key

    modulefd = open(config.job_script)
    ## Lifted from FileDriver.py in jydoop.
    self._job_module = imp.load_module("telemetry_job", modulefd,
                                       config.job_script, ('.py', 'U', 1))
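
# Illustrative sketch (not from the original repo): the constructor above only
# requires that `config` expose these attributes, so a plain namespace works as
# well as parsed argparse results. All paths and values here are hypothetical.
import argparse

example_config = argparse.Namespace(
    num_mappers=4,
    num_reducers=2,
    data_dir="/data/telemetry",          # must be an existing directory
    work_dir="/tmp/telemetry_work",      # must be an existing directory
    job_script="./examples/my_job.py",   # must be an existing .py file
    input_filter="./filter.json",        # must be an existing JSON schema file
    output="./results.out",
    local_only=True,
    bucket=None,
    aws_key=None,
    aws_secret_key=None,
)
# job = Job(example_config)  # assumes the enclosing class is named Job
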
def setUpClass(cls):
    cls.cache_dir = "/tmp/histogram_revision_cache"
    cls.schema_filename = "./telemetry/telemetry_schema.json"
    assert not os.path.exists(cls.cache_dir)

    schema_file = open(cls.schema_filename, "r")
    cls.schema = TelemetrySchema(json.load(schema_file))
    schema_file.close()
    cls.cache = revision_cache.RevisionCache(cls.cache_dir, 'hg.mozilla.org')
    cls.converter = Converter(cls.cache, cls.schema)
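
# Illustrative companion (not shown in the original excerpt): the assert above
# requires a fresh cache directory, so a matching teardown that removes it
# keeps repeated test runs from failing. Using shutil.rmtree here is an
# assumption about how cleanup would be done, not the repo's actual teardown.
@classmethod
def tearDownClass(cls):
    import shutil
    if os.path.exists(cls.cache_dir):
        shutil.rmtree(cls.cache_dir)
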
def test_more_allowed(self):
    spec = {
        "version": 1,
        "dimensions": [{
            "field_name": "reason",
            "allowed_values": ["saved-session"]
        }, {
            "field_name": "appName",
            "allowed_values": "*"
        }, {
            "field_name": "appUpdateChannel",
            "allowed_values": ["nightly"]
        }, {
            "field_name": "appVersion",
            "allowed_values": "*"
        }, {
            "field_name": "appBuildID",
            "allowed_values": "one_specific_build"
        }, {
            "field_name": "submission_date",
            "allowed_values": {
                "min": "20130908",
                "max": "20140401"
            }
        }]
    }
    schema = TelemetrySchema(spec)
    allowed = schema.sanitize_allowed_values()

    # The submission_date dimension (index 5) allows the inclusive
    # range 20130908..20140401.
    self.assertTrue(schema.is_allowed("20130908", allowed[5]))
    self.assertTrue(schema.is_allowed("20140401", allowed[5]))
    self.assertTrue(schema.is_allowed("20130909", allowed[5]))
    self.assertTrue(schema.is_allowed("20140101", allowed[5]))
    self.assertFalse(schema.is_allowed("20130907", allowed[5]))
    self.assertFalse(schema.is_allowed("20000000", allowed[5]))
    self.assertFalse(schema.is_allowed("20140402", allowed[5]))
    self.assertFalse(schema.is_allowed("99999999", allowed[5]))

    # The appBuildID dimension (index 4) requires an exact string match.
    self.assertTrue(schema.is_allowed("one_specific_build", allowed[4]))
    self.assertFalse(schema.is_allowed("two_specific_build", allowed[4]))
    self.assertFalse(schema.is_allowed("*", allowed[4]))
    self.assertFalse(schema.is_allowed("one_specific_build ", allowed[4]))
    self.assertFalse(schema.is_allowed("one-specific-build", allowed[4]))
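
# A minimal sketch (an assumption, not the repo's actual implementation) of
# how an is_allowed check consistent with the test above could work: "*"
# allows anything, a dict with "min"/"max" keys bounds the value
# lexicographically (which orders YYYYMMDD date strings correctly), a list
# means exact membership, and any other value requires an exact string match.
def is_allowed_sketch(value, allowed):
    if allowed == "*":
        return True
    if isinstance(allowed, dict):
        if "min" in allowed and value < allowed["min"]:
            return False
        if "max" in allowed and value > allowed["max"]:
            return False
        return True
    if isinstance(allowed, list):
        return value in allowed
    return value == allowed  # single exact string, e.g. "one_specific_build"
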
def setUp(self):
    self.schema = TelemetrySchema(self.get_schema_spec())
    self.allowed_values = self.schema.sanitize_allowed_values()
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes", type=int,
                        default=500000000)
    parser.add_argument("-i", "--input-file",
                        help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t", "--telemetry-schema",
                        help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    fin = open(args.input_file, "rb")
    bytes_read = 0
    start = datetime.now()
    while True:
        record_count += 1
        # Read the fixed-size record header: two 4-byte values (path and data
        # lengths) and one 8-byte value (timestamp), 16 bytes in all. Stop at
        # end of file (or on a truncated header).
        lengths = fin.read(16)
        if len(lengths) < 16:
            break
        len_path, len_data, timestamp = struct.unpack("<IIQ", lengths)

        # Incoming timestamps are in milliseconds, so convert to POSIX
        # (i.e. seconds) first.
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        path = unicode(fin.read(len_path), errors="replace")
        #print "Path for record", record_count, path, "length of data:", len_data

        # Detect and handle gzipped data.
        data = fin.read(len_data)
        try:
            # Note: from brief testing, cStringIO doesn't appear to be any
            # faster. In fact, it seems slightly slower than StringIO.
            data_reader = StringIO.StringIO(data)
            uncompressor = gzip.GzipFile(fileobj=data_reader, mode="r")
            data = unicode(uncompressor.read(), errors="replace")
            uncompressor.close()
            data_reader.close()
        except Exception, e:
            #print e
            # Data wasn't gzipped; use the string as-is.
            data = unicode(data, errors="replace")

        bytes_read += 8 + len_path + len_data
        #print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        # The path contains the document ID plus five dimension fields. We pop
        # the ID off below, but we also add the submission date, so the
        # component count still has to match the schema's dimension count.
        if len(path_components) != expected_dim_count:
            print "Found an invalid path in record", record_count, path
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
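
# Illustrative writer sketch (an assumption, not part of the original tool)
# for the record format consumed above: a little-endian header of two uint32
# lengths and one uint64 millisecond timestamp, followed by the path bytes and
# the (optionally gzipped) payload bytes. `path` and `data` are byte strings.
import struct
import time

def write_record(fout, path, data, timestamp_ms=None):
    if timestamp_ms is None:
        timestamp_ms = int(time.time() * 1000)
    fout.write(struct.pack("<IIQ", len(path), len(data), timestamp_ms))
    fout.write(path)
    fout.write(data)
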