Exemplo n.º 1
0
def main():
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure'
    )
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema",
                        type=file,
                        default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
def main():
    parser = ArgumentParser(description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
Exemplo n.º 3
0
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[
            file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
Exemplo n.º 4
0
class TestPersist(unittest.TestCase):
    """Tests for StorageLayout writes, newline cleaning and file rotation.

    Every test runs against a fresh directory (see get_test_dir) that is
    created in setUp and removed in tearDown.
    """

    def setUp(self):
        """Build the schema and storage under test and create the test dir."""
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        # 10000 is the maximum output size; test_rotate relies on it.
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        # Refuse to run on top of a pre-existing directory.  assertFalse is
        # used rather than a bare assert so the check survives "python -O".
        self.assertFalse(os.path.exists(test_dir),
                         "test directory already exists: " + test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        """Remove the test directory and everything written into it."""
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        """Return the fixed scratch directory used by these tests."""
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        """Return the schema spec (six dimensions) used to build the schema."""
        return {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["r1", "r2"]
                },
                {
                    "field_name": "appName",
                    "allowed_values": ["a1"]
                },
                {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["c1", "c2", "c3"]
                },
                {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appBuildID",
                    "allowed_values": "*"
                },
                {
                    "field_name": "submission_date",
                    "allowed_values": {
                        "min": "20130101",
                        "max": "20131231"
                    }
                }
            ]
        }

    # NOTE: the test methods are documented with comments rather than
    # docstrings so that verbose unittest output keeps showing method names.

    def test_write_filename(self):
        # Writing a JSON string and writing the equivalent object are both
        # expected to produce files with the same MD5 digest.
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, _ = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, _ = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        # The schema maps the dimensions to a file name, and write() must
        # store the record in exactly that file.
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEqual(test_file,
                         test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, _ = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        # Each "\n" and "\r" character is expected to become one space.
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"),
                         "ab  cd  ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for _ in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEqual(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(
            rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
Exemplo n.º 5
0
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000
    )
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")

        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read,
        duration,
        mb_read / duration,
        bad_record_count,
        record_count,
    )
    return 0
Exemplo n.º 6
0
class TestPersist(unittest.TestCase):
    """Tests for StorageLayout writes, newline cleaning and file rotation.

    Every test runs against a fresh directory (see get_test_dir) that is
    created in setUp and removed in tearDown.
    """

    def setUp(self):
        """Build the schema and storage under test and create the test dir."""
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        # 10000 is the maximum output size; test_rotate relies on it.
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        # Refuse to run on top of a pre-existing directory.  assertFalse is
        # used rather than a bare assert so the check survives "python -O".
        self.assertFalse(os.path.exists(test_dir),
                         "test directory already exists: " + test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        """Remove the test directory and everything written into it."""
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        """Return the fixed scratch directory used by these tests."""
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        """Return the schema spec (six dimensions) used to build the schema."""
        return {
            "version":
            1,
            "dimensions": [{
                "field_name": "reason",
                "allowed_values": ["r1", "r2"]
            }, {
                "field_name": "appName",
                "allowed_values": ["a1"]
            }, {
                "field_name": "appUpdateChannel",
                "allowed_values": ["c1", "c2", "c3"]
            }, {
                "field_name": "appVersion",
                "allowed_values": "*"
            }, {
                "field_name": "appBuildID",
                "allowed_values": "*"
            }, {
                "field_name": "submission_date",
                "allowed_values": {
                    "min": "20130101",
                    "max": "20131231"
                }
            }]
        }

    # NOTE: the test methods are documented with comments rather than
    # docstrings so that verbose unittest output keeps showing method names.

    def test_write_filename(self):
        # Writing a JSON string and writing the equivalent object are both
        # expected to produce files with the same MD5 digest.
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, _ = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, _ = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        # The schema maps the dimensions to a file name, and write() must
        # store the record in exactly that file.
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEqual(test_file,
                         test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, _ = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        # Each "\n" and "\r" character is expected to become one space.
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"),
                         "ab  cd  ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for _ in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEqual(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(
            rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))