Example #1
    def test_unicode(self):
        with open('test/unicode_payload.json') as f:
            payload = f.read()
        upayload = fu.to_unicode(payload)
        converted, dimensions = ConvertTest.converter.convert_json(upayload, '20140101')
        parsed = json.loads(payload)
        self.assertEqual(parsed["data"], converted["data"])
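These tests depend on a to_unicode helper from the project's fileutil module (imported as fu). Its implementation is not shown on this page, but based on how the later examples use it, a minimal Python 2 sketch might look like the following (an assumption for illustration, not the project's actual code):

def to_unicode(value):
    # Payloads arrive as UTF-8 byte strings; decode them explicitly
    # rather than letting Python 2 fall back to the ASCII codec.
    if isinstance(value, str):
        return value.decode("utf-8")
    # Already a unicode object: pass it through unchanged.
    return value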
Example #2
    def test_unicode(self):
        with open('test/unicode_payload.json') as f:
            payload = f.read()
        upayload = fu.to_unicode(payload)
        converted, dimensions = ConvertTest.converter.convert_json(
            upayload, '20140101')
        parsed = json.loads(payload)
        self.assertEqual(parsed["data"], converted["data"])
Example #3
    def handle(self, raw_file):
        self.log("Reading " + raw_file)
        try:
            record_count = 0
            bytes_read = 0
            start = now()
            file_version = fileutil.detect_file_version(raw_file, simple_detection=True)
            self.log("Detected version {0} for file {1}".format(file_version,
                     raw_file))
            for unpacked in fileutil.unpack(raw_file, file_version=file_version):
                record_count += 1
                common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
                current_bytes = common_bytes + unpacked.len_data
                current_bytes_uncompressed = common_bytes + len(unpacked.data)
                bytes_read += current_bytes
                if unpacked.error:
                    self.log("ERROR: Found corrupted data for record {0} in " \
                             "{1} path: {2} Error: {3}".format(record_count,
                                 raw_file, unpacked.path, unpacked.error))
                    self.stats.increment(records_read=1,
                            bytes_read=current_bytes,
                            bytes_uncompressed=current_bytes_uncompressed,
                            bad_records=1, bad_record_type="corrupted_data")
                    continue
                if len(unpacked.data) == 0:
                    self.log("WARN: Found empty data for record {0} in " \
                             "{2} path: {2}".format(record_count, raw_file,
                                 unpacked.path))
                    self.stats.increment(records_read=1,
                            bytes_read=current_bytes,
                            bytes_uncompressed=current_bytes_uncompressed,
                            bad_records=1, bad_record_type="empty_data")
                    continue

                submission_date = ts_to_yyyymmdd(unpacked.timestamp)
                path = fileutil.to_unicode(unpacked.path)

                if unpacked.data[0] != "{":
                    # Data looks weird, should be JSON.
                    self.log("Warning: Found unexpected data for record {0}" \
                             " in {1} path: {2} data:\n{3}".format(record_count,
                                 raw_file, path, unpacked.data))
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    unpacked.data = fileutil.to_unicode(unpacked.data)

                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    bad_record_type = "invalid_path"
                    if ReadRawStep.UUID_ONLY_PATH.match(path):
                        bad_record_type = "uuid_only_path"
                    else:
                        self.log("Found an invalid path in record {0}: " \
                             "{1}".format(record_count, path))
                    self.stats.increment(records_read=1,
                            bytes_read=current_bytes,
                            bytes_uncompressed=current_bytes_uncompressed,
                            bad_records=1, bad_record_type=bad_record_type)
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)
                channel = self.schema.get_field(dims, "appUpdateChannel",
                        True, True)

                self.stats.increment(channel=channel, records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = unpacked.data
                        # TODO: Converter.VERSION_UNCONVERTED
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(
                                unpacked.data, dims[-1], unpacked.ip)
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                self.log("Record {0} mismatched dimension " \
                                         "{1}: '{2}' != '{3}'".format(
                                            record_count, i, dims[i],
                                            parsed_dims[i]))
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        # TODO: Converter.VERSION_CONVERTED
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims,
                                data_version)
                        self.stats.increment(channel=channel, records_written=1,
                            bytes_written=len(key) + len(serialized_data) + 2)
                        # Compress rotated files as we generate them
                        if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data,
                                str(e), "ERROR Writing to output file:",
                                "write_failed")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, unpacked.data, e.msg,
                            "Bad Payload:", "bad_payload")
                except Exception, e:
                    err_message = str(e)
                    if err_message == "Missing in payload: info.revision":
                        # We don't need to write these bad records out - we know
                        # why they are being skipped.
                        self.stats.increment(channel=channel, bad_records=1,
                                bad_record_type="missing_revision")
                    elif err_message == "Invalid revision URL: /rev/":
                        # We do want to log these payloads, but we don't want
                        # the full stack trace.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "missing_revision_repo")
                    # Don't split this long string - we want to be able to find it in the code
                    elif err_message.startswith("JSONDecodeError: Invalid control character"):
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        #       them gracefully.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "conversion_error")
                        self.log(traceback.format_exc())
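The ts_to_yyyymmdd helper called above is not defined in this snippet, but Example #5 below performs the same conversion inline. A sketch consistent with that inline version (an assumption, not the project's actual helper):

from datetime import date

def ts_to_yyyymmdd(timestamp):
    # Incoming timestamps are in milliseconds, so convert to POSIX
    # seconds first, then format the submission date as YYYYMMDD.
    return date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")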
Example #4
    def test_utf8(self):
        count = 0
        # This packed file contains the same record twice, once gzipped, once
        # raw. The record contains the UTF-8 encoded string "Wikipédia"
        for r in fu.unpack('test/unicode.v1.packed', file_version="v1"):
            self.assertIs(r.error, None)
            self.assertTrue(len(r.data) > 0)
            self.assertEqual(r.data[0], '{')
            count += 1

            # self.print_byte_range(r.data, 3323, 3362)

            # Make sure that the raw bytes are correct:
            # Incoming is UTF-8, so we expect
            #   W i k i p 0xc3 0xa9 d i a
            self.assertEqual(ord(r.data[0xd16]), 0x70)
            self.assertEqual(ord(r.data[0xd17]), 0xc3)
            self.assertEqual(ord(r.data[0xd18]), 0xa9)
            self.assertEqual(ord(r.data[0xd19]), 0x64)

            # Convert the data the wrong way:
            bad = unicode(r.data, errors="replace")
            # self.print_byte_range(bad, 3323, 3362)

            # Verify that we see the replacement chars in the expected places:
            #   W i k i p 0xfffd 0xfffd d i a
            self.assertEqual(ord(bad[0xd16]), 0x70)
            self.assertEqual(ord(bad[0xd17]), 0xfffd)
            self.assertEqual(ord(bad[0xd18]), 0xfffd)
            self.assertEqual(ord(bad[0xd19]), 0x64)

            # Now convert properly:
            good = fu.to_unicode(r.data)
            # self.print_byte_range(good, 3322, 3360)

            # Now we have unicode, so we expect
            #   W i k i p 0xe9 d i a
            self.assertEqual(ord(good[0xd15]), 0x70)
            self.assertEqual(ord(good[0xd16]), 0xe9)
            self.assertEqual(ord(good[0xd17]), 0x64)

            converted, dimensions = ConvertTest.converter.convert_json(
                good, "20131114", None)

            engine = converted["simpleMeasurements"]["UITelemetry"][
                "toolbars"]["currentSearchEngine"]
            # print engine

            self.assertEqual(ord(engine[4]), 0x70)
            self.assertEqual(ord(engine[5]), 0xe9)
            self.assertEqual(ord(engine[6]), 0x64)

            serialized = ConvertTest.converter.serialize(converted,
                                                         sort_keys=True)

            # self.print_byte_range(serialized, 4007, 4049)
            # Now we have escaped unicode, so we expect
            #   W i k i p \ u 0 0 e 9 d i a
            self.assertEqual(ord(serialized[0xfc2]), 0x70)
            self.assertEqual(ord(serialized[0xfc3]), 0x5c)
            self.assertEqual(ord(serialized[0xfc4]), 0x75)
            self.assertEqual(ord(serialized[0xfc5]), 0x30)
            self.assertEqual(ord(serialized[0xfc6]), 0x30)
            self.assertEqual(ord(serialized[0xfc7]), 0x65)
            self.assertEqual(ord(serialized[0xfc8]), 0x39)
            self.assertEqual(ord(serialized[0xfc9]), 0x64)

        self.assertEqual(count, 2)
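The byte offsets above (0xd16 and so on) are specific to the packed test file, but the encoding behaviour the assertions check can be reproduced standalone. A minimal Python 2 illustration (nothing here is project-specific):

# -*- coding: utf-8 -*-
import json

raw = u"Wikipédia".encode("utf-8")
# UTF-8 encodes é as the two-byte sequence 0xc3 0xa9:
assert [ord(b) for b in raw[4:8]] == [0x70, 0xc3, 0xa9, 0x64]    # p é d
# Decoding with the default (ASCII) codec and errors="replace" turns
# each of those two bytes into U+FFFD, the replacement character:
bad = unicode(raw, errors="replace")
assert [ord(c) for c in bad[4:8]] == [0x70, 0xfffd, 0xfffd, 0x64]
# Decoding as UTF-8 recovers the single code point U+00E9:
good = raw.decode("utf-8")
assert [ord(c) for c in good[4:7]] == [0x70, 0xe9, 0x64]         # p é d
# JSON serialization escapes non-ASCII by default, giving the \u00e9
# form the final assertions look for:
assert json.dumps(good) == '"Wikip\\u00e9dia"'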
Example #5
    def handle(self, raw_file):
        print self.label, "reading", raw_file
        try:
            record_count = 0
            bytes_read = 0
            start = datetime.now()
            for len_path, len_data, timestamp, path, data, err in fileutil.unpack(
                    raw_file):
                record_count += 1
                self.records_read += 1
                if err:
                    print self.label, "ERROR: Found corrupted data for record", record_count, "in", raw_file, "path:", path, "Error:", err
                    self.bad_records += 1
                    continue
                if len(data) == 0:
                    print self.label, "ERROR: Found empty data for record", record_count, "in", raw_file, "path:", path
                    self.bad_records += 1
                    continue

                # Incoming timestamps are in milliseconds, so convert to POSIX first
                # (i.e. seconds)
                submission_date = date.fromtimestamp(timestamp /
                                                     1000).strftime("%Y%m%d")
                path = fileutil.to_unicode(path)
                #print "Path for record", record_count, path, "length of data:", len_data

                if data[0] != "{":
                    # Data looks weird, should be JSON.
                    print self.label, "Warning: Found unexpected data for record", record_count, "in", raw_file, "path:", path, "data:"
                    print data
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    data = fileutil.to_unicode(data)

                current_bytes = len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH[
                    "v1"]
                bytes_read += current_bytes
                self.bytes_read += current_bytes
                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    print self.label, "Found an invalid path in record", record_count, path
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = data
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(
                            data, dims[-1])
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                print self.label, "Record", self.records_read, "mismatched dimension", i, dims[
                                    i], "!=", parsed_dims[i]
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims,
                                               data_version)
                        self.bytes_written += len(key) + len(
                            serialized_data) + 1
                        self.records_written += 1
                        # Compress rotated files as we generate them
                        if n.endswith(
                                StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data,
                                              str(e),
                                              "ERROR Writing to output file:")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, data, e.msg,
                                          "Bad Payload:")
                except Exception, e:
                    err_message = str(e)

                    # We don't need to write these bad records out - we know
                    # why they are being skipped.
                    if err_message != "Missing in payload: info.revision":
                        # TODO: recognize other common failure modes and handle them gracefully.
                        self.write_bad_record(key, dims, data, err_message,
                                              "Conversion Error:")
                        traceback.print_exc()
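The five pop(0) calls above rely on a fixed submission-path layout: the record id followed by five metadata fields, six components in all (expected_dim_count). A worked example with made-up values:

# Hypothetical path; only the component order is what the code assumes.
path = u"some-uuid/saved-session/Firefox/28.0/nightly/20140101004002"
components = path.split("/")
key = components.pop(0)  # "some-uuid" -- the record id
info = dict(zip(["reason", "appName", "appVersion",
                 "appUpdateChannel", "appBuildID"], components))
# schema.dimensions_from(info, submission_date) then swaps the popped
# id for the submission date, which is why the count "evens out".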
Example #6
    def test_utf8(self):
        count = 0
        # This packed file contains the same record twice, once gzipped, once
        # raw. The record contains the UTF-8 encoded string "Wikipédia"
        for r in fu.unpack('test/unicode.v1.packed', file_version="v1"):
            self.assertIs(r.error, None)
            self.assertTrue(len(r.data) > 0)
            self.assertEqual(r.data[0], '{')
            count += 1

            # self.print_byte_range(r.data, 3323, 3362)

            # Make sure that the raw bytes are correct:
            # Incoming is UTF-8, so we expect
            #   W i k i p 0xc3 0xa9 d i a
            self.assertEqual(ord(r.data[0xd16]), 0x70)
            self.assertEqual(ord(r.data[0xd17]), 0xc3)
            self.assertEqual(ord(r.data[0xd18]), 0xa9)
            self.assertEqual(ord(r.data[0xd19]), 0x64)

            # Convert the data the wrong way:
            bad = unicode(r.data, errors="replace")
            # self.print_byte_range(bad, 3323, 3362)

            # Verify that we see the replacement chars in the expected places:
            #   W i k i p 0xfffd 0xfffd d i a
            self.assertEqual(ord(bad[0xd16]), 0x70)
            self.assertEqual(ord(bad[0xd17]), 0xfffd)
            self.assertEqual(ord(bad[0xd18]), 0xfffd)
            self.assertEqual(ord(bad[0xd19]), 0x64)

            # Now convert properly:
            good = fu.to_unicode(r.data)
            # self.print_byte_range(good, 3322, 3360)

            # Now we have unicode, so we expect
            #   W i k i p 0xe9 d i a
            self.assertEqual(ord(good[0xd15]), 0x70)
            self.assertEqual(ord(good[0xd16]), 0xe9)
            self.assertEqual(ord(good[0xd17]), 0x64)

            converted, dimensions = ConvertTest.converter.convert_json(good, "20131114", None)

            engine = converted["simpleMeasurements"]["UITelemetry"]["toolbars"]["currentSearchEngine"]
            # print engine

            self.assertEqual(ord(engine[4]), 0x70)
            self.assertEqual(ord(engine[5]), 0xe9)
            self.assertEqual(ord(engine[6]), 0x64)

            serialized = ConvertTest.converter.serialize(converted, sort_keys=True)

            # self.print_byte_range(serialized, 4007, 4049)
            # Now we have escaped unicode, so we expect
            #   W i k i p \ u 0 0 e 9 d i a
            self.assertEqual(ord(serialized[0xfc2]), 0x70)
            self.assertEqual(ord(serialized[0xfc3]), 0x5c)
            self.assertEqual(ord(serialized[0xfc4]), 0x75)
            self.assertEqual(ord(serialized[0xfc5]), 0x30)
            self.assertEqual(ord(serialized[0xfc6]), 0x30)
            self.assertEqual(ord(serialized[0xfc7]), 0x65)
            self.assertEqual(ord(serialized[0xfc8]), 0x39)
            self.assertEqual(ord(serialized[0xfc9]), 0x64)

        self.assertEqual(count, 2)
Example #7
    def handle(self, raw_file):
        print self.label, "reading", raw_file
        try:
            record_count = 0
            bytes_read = 0
            start = datetime.now()
            for len_path, len_data, timestamp, path, data, err in fileutil.unpack(raw_file):
                record_count += 1
                self.records_read += 1
                if err:
                    print self.label, "ERROR: Found corrupted data for record", record_count, "in", raw_file, "path:", path, "Error:", err
                    self.bad_records += 1
                    continue
                if len(data) == 0:
                    print self.label, "ERROR: Found empty data for record", record_count, "in", raw_file, "path:", path
                    self.bad_records += 1
                    continue

                # Incoming timestamps are in milliseconds, so convert to POSIX first
                # (i.e. seconds)
                submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
                path = fileutil.to_unicode(path)
                #print "Path for record", record_count, path, "length of data:", len_data

                if data[0] != "{":
                    # Data looks weird, should be JSON.
                    print self.label, "Warning: Found unexpected data for record", record_count, "in", raw_file, "path:", path, "data:"
                    print data
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    data = fileutil.to_unicode(data)

                current_bytes = len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH["v1"]
                bytes_read += current_bytes
                self.bytes_read += current_bytes
                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    print self.label, "Found an invalid path in record", record_count, path
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = data
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(data, dims[-1])
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                print self.label, "Record", self.records_read, "mismatched dimension", i, dims[i], "!=", parsed_dims[i]
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims, data_version)
                        self.bytes_written += len(key) + len(serialized_data) + 1
                        self.records_written += 1
                        # Compress rotated files as we generate them
                        if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data, str(e), "ERROR Writing to output file:")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, data, e.msg, "Bad Payload:")
                except Exception, e:
                    err_message = str(e)

                    # We don't need to write these bad records out - we know
                    # why they are being skipped.
                    if err_message != "Missing in payload: info.revision":
                        # TODO: recognize other common failure modes and handle them gracefully.
                        self.write_bad_record(key, dims, data, err_message, "Conversion Error:")
                        traceback.print_exc()
Example #8
    def handle(self, raw_file):
        self.log("Reading " + raw_file)
        try:
            record_count = 0
            bytes_read = 0
            start = now()
            file_version = fileutil.detect_file_version(raw_file,
                                                        simple_detection=True)
            self.log("Detected version {0} for file {1}".format(
                file_version, raw_file))
            for unpacked in fileutil.unpack(raw_file,
                                            file_version=file_version):
                record_count += 1
                common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[
                    file_version]
                current_bytes = common_bytes + unpacked.len_data
                current_bytes_uncompressed = common_bytes + len(unpacked.data)
                bytes_read += current_bytes
                if unpacked.error:
                    self.log("ERROR: Found corrupted data for record {0} in " \
                             "{1} path: {2} Error: {3}".format(record_count,
                                 raw_file, unpacked.path, unpacked.error))
                    self.stats.increment(
                        records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1,
                        bad_record_type="corrupted_data")
                    continue
                if len(unpacked.data) == 0:
                    self.log("WARN: Found empty data for record {0} in " \
                             "{2} path: {2}".format(record_count, raw_file,
                                 unpacked.path))
                    self.stats.increment(
                        records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1,
                        bad_record_type="empty_data")
                    continue

                submission_date = ts_to_yyyymmdd(unpacked.timestamp)
                path = fileutil.to_unicode(unpacked.path)

                if unpacked.data[0] != "{":
                    # Data looks weird, should be JSON.
                    self.log("Warning: Found unexpected data for record {0}" \
                             " in {1} path: {2} data:\n{3}".format(record_count,
                                 raw_file, path, unpacked.data))
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    unpacked.data = fileutil.to_unicode(unpacked.data)

                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    bad_record_type = "invalid_path"
                    if ReadRawStep.UUID_ONLY_PATH.match(path):
                        bad_record_type = "uuid_only_path"
                    else:
                        self.log("Found an invalid path in record {0}: " \
                             "{1}".format(record_count, path))
                    self.stats.increment(
                        records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed,
                        bad_records=1,
                        bad_record_type=bad_record_type)
                    continue

                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)
                channel = self.schema.get_field(dims, "appUpdateChannel", True,
                                                True)

                self.stats.increment(
                    channel=channel,
                    records_read=1,
                    bytes_read=current_bytes,
                    bytes_uncompressed=current_bytes_uncompressed)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = unpacked.data
                        # TODO: Converter.VERSION_UNCONVERTED
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(
                            unpacked.data, dims[-1], unpacked.ip)
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                self.log("Record {0} mismatched dimension " \
                                         "{1}: '{2}' != '{3}'".format(
                                            record_count, i, dims[i],
                                            parsed_dims[i]))
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        # TODO: Converter.VERSION_CONVERTED
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims,
                                               data_version)
                        self.stats.increment(channel=channel,
                                             records_written=1,
                                             bytes_written=len(key) +
                                             len(serialized_data) + 2)
                        # Compress rotated files as we generate them
                        if n.endswith(
                                StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data,
                                              str(e),
                                              "ERROR Writing to output file:",
                                              "write_failed")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, unpacked.data, e.msg,
                                          "Bad Payload:", "bad_payload")
                except Exception, e:
                    err_message = str(e)
                    if err_message == "Missing in payload: info.revision":
                        # We don't need to write these bad records out - we know
                        # why they are being skipped.
                        self.stats.increment(
                            channel=channel,
                            bad_records=1,
                            bad_record_type="missing_revision")
                    elif err_message == "Invalid revision URL: /rev/":
                        # We do want to log these payloads, but we don't want
                        # the full stack trace.
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "missing_revision_repo")
                    # Don't split this long string - we want to be able to find it in the code
                    elif err_message.startswith(
                            "JSONDecodeError: Invalid control character"):
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        #       them gracefully.
                        self.write_bad_record(key, dims, unpacked.data,
                                              err_message, "Conversion Error",
                                              "conversion_error")
                        self.log(traceback.format_exc())
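Unlike Examples #5 and #7, which bump plain integer attributes, this version routes all counting through a self.stats object. Its implementation is not shown; a hypothetical minimal stand-in that supports the calls above:

from collections import defaultdict

class Stats(object):
    """Counter bag: increment() bumps arbitrary keyword-named counters,
    optionally labelled by channel and bad_record_type."""
    def __init__(self):
        self.counters = defaultdict(int)

    def increment(self, channel=None, bad_record_type=None, **counts):
        for name, value in counts.items():
            self.counters[(channel, bad_record_type, name)] += value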
Example #9
def main():
    parser = argparse.ArgumentParser(description='Split raw logs into partitioned files.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-f", "--file-version", help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (i.e. seconds)
        submission_date = date.fromtimestamp(r.timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = fileutil.to_unicode(r.path)
        data = fileutil.to_unicode(r.data)

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
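Assuming this main() lives in a standalone script (the file names below are made up for illustration), an invocation with the three required flags might look like:

import sys

# Hypothetical paths; -m and -f fall back to their defaults.
sys.argv = ["split_raw_logs.py",
            "--input-file", "telemetry.log",
            "--output-dir", "./partitioned",
            "--telemetry-schema", "telemetry_schema.json"]
sys.exit(main())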