Example No. 1
def main():
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure'
    )
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema",
                        type=file,
                        default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
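A note on the core flow in this example: the script loads the schema JSON into a TelemetrySchema, wraps it in a StorageLayout, derives the storage dimensions for each ping, and calls storage.write(). A stripped-down sketch of just that flow follows; the import paths are assumptions (they are not shown in the example), the schema file must exist on disk exactly as in the script, and the hard-coded dimension values are borrowed from the test schema further down.

# Minimal sketch of the schema -> StorageLayout -> write() flow used above.
# The import paths are assumptions; the schema file is expected to exist on
# disk, just as in the script itself.
import json

from telemetry.telemetry_schema import TelemetrySchema   # assumed module path
from telemetry.persist import StorageLayout              # assumed module path

with open("./telemetry/telemetry_schema.json") as schema_file:
    schema = TelemetrySchema(json.load(schema_file))

storage = StorageLayout(schema, "./processed", 500000000)  # rotate after ~500 MB

# Dimension order: reason, appName, appUpdateChannel, appVersion, appBuildID,
# submission_date. The values below reuse the test schema further down.
dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
n = storage.write("some-ping-id", '{"example": "payload"}', dims)
print "saved to", n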
Example No. 3
      "allowed_values": ["c1", "c2", "c3"]
    },
    {
      "field_name": "appVersion",
      "allowed_values": "*"
    },
    {
      "field_name": "appBuildID",
     "allowed_values": "*"
    },
    {
      "field_name": "submission_date",
      "allowed_values": {
          "min": "20130101",
          "max": "20131231"
      }
    }
  ]
}

try:
    schema = TelemetrySchema(schema_spec)
    storage = StorageLayout(schema, test_dir, 10000)
    test_file_1 = os.path.join(test_dir, "test.log")
    storage.write_filename("foo", '{"bar": "baz"}', test_file_1)
    test_file_1_md5, test_file_1_size = fileutil.md5file(test_file_1)
    assert test_file_1_md5 == "206dd2d33a04802c31d2c74f10cc472b"
    assert storage.clean_newlines("ab\n\ncd\r\n") == "ab  cd  "
finally:
    shutil.rmtree(test_dir)
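The snippet above is cut off before the start of schema_spec. A self-contained sketch of the same check follows; the head of schema_spec is assumed to match the spec used by the TestPersist example further down, and the import paths and the temporary test_dir are assumptions as well.

# Self-contained sketch of the truncated test above. The top of schema_spec is
# assumed to match the spec in the TestPersist example below; the import paths
# and the temporary test_dir are assumptions.
import os
import shutil
import tempfile

from telemetry.telemetry_schema import TelemetrySchema   # assumed module path
from telemetry.persist import StorageLayout              # assumed module path
from telemetry.util import files as fileutil             # assumed module path

schema_spec = {
    "version": 1,
    "dimensions": [
        {"field_name": "reason", "allowed_values": ["r1", "r2"]},
        {"field_name": "appName", "allowed_values": ["a1"]},
        {"field_name": "appUpdateChannel", "allowed_values": ["c1", "c2", "c3"]},
        {"field_name": "appVersion", "allowed_values": "*"},
        {"field_name": "appBuildID", "allowed_values": "*"},
        {"field_name": "submission_date",
         "allowed_values": {"min": "20130101", "max": "20131231"}},
    ]
}

test_dir = tempfile.mkdtemp()
try:
    schema = TelemetrySchema(schema_spec)
    storage = StorageLayout(schema, test_dir, 10000)
    test_file_1 = os.path.join(test_dir, "test.log")
    storage.write_filename("foo", '{"bar": "baz"}', test_file_1)
    test_file_1_md5, test_file_1_size = fileutil.md5file(test_file_1)
    assert test_file_1_md5 == "206dd2d33a04802c31d2c74f10cc472b"
    assert storage.clean_newlines("ab\n\ncd\r\n") == "ab  cd  "
finally:
    shutil.rmtree(test_dir)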
Example No. 4
def main():
    signal.signal(signal.SIGINT, handle_sigint)
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c",
                        "--config",
                        required=True,
                        type=file,
                        help="AWS Configuration file (json)")
    parser.add_argument("-w",
                        "--work-dir",
                        required=True,
                        help="Location to cache downloaded files")
    parser.add_argument("-o",
                        "--output-dir",
                        required=True,
                        help="Base dir to store processed data")
    parser.add_argument("-i",
                        "--input-files",
                        type=file,
                        help="File containing a list of keys to process")
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-l", "--log-file", help="Log output to this file")
    parser.add_argument("-s",
                        "--stats-file",
                        help="Log statistics to this file")
    parser.add_argument("--histogram-cache-path",
                        default="./histogram_cache",
                        help="Path to store a local cache of histograms")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        required=True,
                        help="Location of the desired telemetry schema")
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        type=int,
                        default=500000000,
                        help="Rotate output files after N bytes")
    parser.add_argument("-D",
                        "--dry-run",
                        action="store_true",
                        help="Don't modify remote files")
    parser.add_argument("-n",
                        "--no-clean",
                        action="store_true",
                        help="Don't clean out the output-dir before beginning")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="Print more detailed output")
    args = parser.parse_args()

    if args.verbose:
        # Turn on mp logging
        multiprocessing.log_to_stderr(logging.DEBUG)

    config = json.load(args.config)
    # TODO: allow commandline args to override config values.

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    logger = Log(args.log_file, "Master")
    num_cpus = multiprocessing.cpu_count()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    s3downloader = None
    raw_readers = None
    compressors = None
    exporters = None
    done = False

    if args.no_clean:
        logger.log("Not removing log files in {}".format(args.output_dir))
    else:
        # Remove existing log files from output_dir (to clean up after an
        # incomplete previous run, for example).
        logger.log("Removing log files in {}".format(args.output_dir))
        for root, dirs, files in os.walk(args.output_dir):
            for f in files:
                if f.endswith(".log"):
                    full = os.path.join(root, f)
                    if args.dry_run:
                        logger.log("Would be deleting {}, except it's a " \
                                   "dry run".format(full))
                    else:
                        try:
                            logger.log("Removing existing file: " + full)
                            os.remove(full)
                        except Exception, e:
                            logger.log("Error removing existing " \
                                       " file {}: {}".format(full, e))
Example No. 5
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s",
                        "--aws-secret-key",
                        help="AWS Secret Key",
                        required=True)
    parser.add_argument("-r",
                        "--aws-region",
                        help="AWS Region",
                        default="us-west-2")
    parser.add_argument("-w",
                        "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base dir to store processed data",
                        required=True)
    parser.add_argument("-i",
                        "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-b",
                        "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-q",
                        "--queue",
                        help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c",
                        "--histogram-cache-path",
                        help="Path to store a local cache of histograms",
                        default="./histogram_cache")
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-D",
                        "--dry-run",
                        help="Don't modify remote files",
                        action="store_true")
    parser.add_argument("-C",
                        "--skip-conversion",
                        help="Skip validation/conversion of payloads",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(
                args.aws_region,
                aws_access_key_id=args.aws_key,
                aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus -
                                                       len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print "  ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found.  Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)

    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [
        os.path.join(args.work_dir, f) for f in incoming_filenames
    ]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(
        num_cpus, "Reader", ReadRawStep, raw_files,
        (completed_files, schema, converter, storage, args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files, ))

    # Export compressed files to S3.
    exporters = start_workers(
        num_cpus, "Exporter", ExportCompressedStep, compressed_files,
        (args.output_dir, args.aws_key, args.aws_secret_key,
         args.publish_bucket, args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # `find <out_dir> -type f -not -name ".compressme"`
    # Add them to completed_files
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "  Message deleted successfully"
                else:
                    print "  Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (
        duration, timer.delta_sec(after_download))
    return 0
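The second half of this example wires up a three-stage pipeline: readers pull raw file names off a queue, compressors pick up completed .log files, and exporters upload the compressed output, with one PipeStep.SENTINEL pushed per worker to shut each stage down. A stripped-down sketch of that queue-plus-sentinel pattern follows, using plain multiprocessing in place of the start_workers/PipeStep helpers (which are not shown here).

# Stripped-down sketch of the queue + sentinel worker pattern used above,
# with plain multiprocessing standing in for start_workers/PipeStep.
import multiprocessing

SENTINEL = "STOP"   # stands in for PipeStep.SENTINEL

def worker(name, in_queue, out_queue):
    while True:
        item = in_queue.get()
        if item == SENTINEL:
            break
        # ... the real work (read, compress, or export) happens here ...
        out_queue.put("%s handled %s" % (name, item))

if __name__ == "__main__":
    num_cpus = multiprocessing.cpu_count()
    raw_files = multiprocessing.Queue()
    completed_files = multiprocessing.Queue()

    readers = [multiprocessing.Process(target=worker,
                                       args=("Reader-%d" % i,
                                             raw_files, completed_files))
               for i in range(num_cpus)]
    for r in readers:
        r.start()

    for f in ["a.log", "b.log", "c.log"]:
        raw_files.put(f)

    # One sentinel per worker, so every reader sees exactly one stop signal.
    for i in range(num_cpus):
        raw_files.put(SENTINEL)

    for r in readers:
        r.join()

    while not completed_files.empty():
        print completed_files.get()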
Example No. 6
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[
            file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
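Each record's path encodes the document id followed by the raw dimension values, which this loop splits apart and hands to dimensions_from() before writing. A small sketch of that mapping is below; the example values reuse the allowed values from the test schema further down, and the expected filename format is taken from the test_write case in the same example.

# Sketch of how a record path maps onto the schema dimensions. The example
# values reuse the test schema further down; the expected filename format is
# taken from the test_write case in the TestPersist example.
path = "some-document-id/r1/a1/v1/c1/b1"
path_components = path.split("/")

key = path_components.pop(0)                        # document id -> storage key
info = {}
info["reason"] = path_components.pop(0)             # "r1"
info["appName"] = path_components.pop(0)            # "a1"
info["appVersion"] = path_components.pop(0)         # "v1"
info["appUpdateChannel"] = path_components.pop(0)   # "c1"
info["appBuildID"] = path_components.pop(0)         # "b1"

# dimensions_from(info, submission_date) reorders these into schema order and
# appends the submission date, so get_filename() yields something like:
#   <output_dir>/r1/a1/c1/v1/b1.20130102.v1.log
print key, info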
Example No. 7
def setUp(self):
    test_dir = self.get_test_dir()
    self.schema = TelemetrySchema(self.get_schema_spec())
    self.storage = StorageLayout(self.schema, test_dir, 10000)
    assert not os.path.exists(test_dir)
    os.makedirs(test_dir)
Example No. 8
class TestPersist(unittest.TestCase):
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        shutil.rmtree(self.get_test_dir())


    def get_test_dir(self):
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        return {
            "version": 1,
            "dimensions": [
                {
                    "field_name": "reason",
                    "allowed_values": ["r1","r2"]
                },
                {
                    "field_name": "appName",
                    "allowed_values": ["a1"]
                },
                {
                    "field_name": "appUpdateChannel",
                    "allowed_values": ["c1", "c2", "c3"]
                },
                {
                    "field_name": "appVersion",
                    "allowed_values": "*"
                },
                {
                    "field_name": "appBuildID",
                   "allowed_values": "*"
                },
                {
                    "field_name": "submission_date",
                    "allowed_values": {
                          "min": "20130101",
                          "max": "20131231"
                    }
                }
            ]
        }

    def test_write_filename(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar":"baz"}, test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEquals(test_file, test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"), "ab  cd  ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for i in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEquals(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
Example No. 9
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000
    )
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")

        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read,
        duration,
        mb_read / duration,
        bad_record_count,
        record_count,
    )
    return 0