예제 #1
0
    def test_profile_stream_error_no_data(self):
        """Requests carrying no ``json_data`` are answered with an error.

        Five requests that only set ``request_id`` are streamed; the
        accumulated meta must echo the request id, report zero records and
        carry the ``UNKNOWN_ENCODING`` error type.
        """

        def request_messages():
            # Only the request id is populated; json_data is left unset.
            for _ in range(5):
                yield profiler_pb2.ProfileRequest(request_id="123")

        # Scalar fields copied from every streamed meta into the accumulator.
        meta_fields = ('request_id', 'schema', 'total_records',
                       'service_version', 'schema_byte_size',
                       'profile_byte_size')
        error_fields = ('message', 'type')

        responses_seen = 0
        message = profiler_pb2.ProfileDataStreamResponse()
        for response in self._stub.ProfileDataStream(request_messages()):
            if response.HasField("meta"):
                responses_seen += 1
                for name in meta_fields:
                    setattr(message.meta, name, getattr(response.meta, name))
                for name in error_fields:
                    setattr(message.meta.error, name,
                            getattr(response.meta.error, name))
            elif response.HasField("profile"):
                responses_seen += 1
                message.profile += response.profile

        expected_error = domain_pb2.ProfilerError.Type.Value(
            'UNKNOWN_ENCODING')
        self.assertEqual(json.loads(message.meta.request_id), 123)
        self.assertEqual(0, message.meta.total_records)
        self.assertEqual(message.meta.error.type, expected_error)
예제 #2
0
    def test_profile_stream_size_huge_report_timeout(self):
        """Stream ``testdata/huge-report.json`` and check the returned meta.

        Every meta response must echo the request id, report 3 records and
        expose the expected schema entries.
        """

        def request_messages():
            # One request per top-level element of the JSON fixture.
            with open('testdata/huge-report.json') as fixture:
                for entry in json.load(fixture):
                    yield profiler_pb2.ProfileRequest(
                        request_id="123", json_data=json.dumps(entry))

        # Scalar fields copied from every streamed meta into the accumulator.
        meta_fields = ('request_id', 'schema', 'total_records',
                       'service_version', 'schema_byte_size',
                       'profile_byte_size')

        responses_seen = 0
        message = profiler_pb2.ProfileDataStreamResponse()
        for response in self._stub.ProfileDataStream(request_messages()):
            if response.HasField("meta"):
                responses_seen += 1
                for name in meta_fields:
                    setattr(message.meta, name, getattr(response.meta, name))

                self.assertEqual(json.loads(message.meta.request_id), 123)
                self.assertEqual(3, message.meta.total_records)
                self.assertSchema(message.meta.schema, 'field1', 'string',
                                  'field101', 'string')
            elif response.HasField("profile"):
                responses_seen += 1
                message.profile += response.profile

        # NOTE(review): the error fields are never copied from the stream
        # into `message` above, so this compares the accumulator's default
        # error type with 0 -- confirm whether a timeout error was meant to
        # be asserted here.
        self.assertEqual(0, message.meta.error.type)
예제 #3
0
    def test_profile_stream_size_toeggolomat(self):
        """Stream ``testdata/toeggelomat_join.json`` and check the meta.

        Every meta response must echo the request id, report 27 records and
        expose the expected schema entries.
        """

        def request_messages():
            # One request per top-level element of the JSON fixture.
            with open('testdata/toeggelomat_join.json') as fixture:
                for entry in json.load(fixture):
                    yield profiler_pb2.ProfileRequest(
                        request_id="123", json_data=json.dumps(entry))

        # Scalar fields copied from every streamed meta into the accumulator.
        meta_fields = ('request_id', 'schema', 'total_records',
                       'service_version', 'schema_byte_size',
                       'profile_byte_size')

        responses_seen = 0
        message = profiler_pb2.ProfileDataStreamResponse()
        for response in self._stub.ProfileDataStream(request_messages()):
            if response.HasField("meta"):
                responses_seen += 1
                for name in meta_fields:
                    setattr(message.meta, name, getattr(response.meta, name))

                self.assertEqual(json.loads(message.meta.request_id), 123)
                self.assertEqual(27, message.meta.total_records)
                self.assertSchema(message.meta.schema, 'matchUuid', 'string',
                                  'blueScore', 'integer')
            elif response.HasField("profile"):
                responses_seen += 1
                message.profile += response.profile
예제 #4
0
    def test_profile_stream_no_error_without_request_id(self):
        """Requests that omit ``request_id`` are still profiled.

        Five identical records are streamed without a request id; the meta
        must describe the schema and record count, and the concatenated
        profile must contain the report marker text.
        """

        def request_messages():
            payload = {'x': 'a', 'y': 2}
            for _ in range(5):
                yield profiler_pb2.ProfileRequest(
                    json_data=json.dumps(payload))

        # Scalar fields copied from every streamed meta into the accumulator.
        meta_fields = ('request_id', 'schema', 'total_records',
                       'service_version', 'schema_byte_size',
                       'profile_byte_size')
        error_fields = ('message', 'type')

        responses_seen = 0
        message = profiler_pb2.ProfileDataStreamResponse()
        for response in self._stub.ProfileDataStream(request_messages()):
            if response.HasField("meta"):
                responses_seen += 1
                for name in meta_fields:
                    setattr(message.meta, name, getattr(response.meta, name))
                for name in error_fields:
                    setattr(message.meta.error, name,
                            getattr(response.meta.error, name))
                self.assertSchema(message.meta.schema, 'x', 'string', 'y',
                                  'integer')
                self.assertEqual(5, message.meta.total_records)
            elif response.HasField("profile"):
                responses_seen += 1
                message.profile += response.profile

        report_marker = r'Profile report generated with the `pandas-profiling` Python package'
        self.assertRegex(message.profile, report_marker)
예제 #5
0
    def test_profile_stream_json_normalize(self):
        """Nested objects show up flattened in the generated profile.

        Five records with a nested ``x`` object are streamed; the schema
        must describe the nested fields and the concatenated profile must
        mention the ``x/foo`` and ``x/test`` names.
        """

        def request_messages():
            payload = {'x': {'test': 1, 'foo': 'bar'}, 'y': 2}
            for _ in range(5):
                yield profiler_pb2.ProfileRequest(
                    request_id="123", json_data=json.dumps(payload))

        # Scalar fields copied from every streamed meta into the accumulator.
        meta_fields = ('request_id', 'schema', 'total_records',
                       'service_version', 'schema_byte_size',
                       'profile_byte_size')

        responses_seen = 0
        message = profiler_pb2.ProfileDataStreamResponse()
        for response in self._stub.ProfileDataStream(request_messages()):
            if response.HasField("meta"):
                responses_seen += 1
                for name in meta_fields:
                    setattr(message.meta, name, getattr(response.meta, name))
                self.assertEqual(json.loads(message.meta.request_id), 123)
                self.assertEqual(5, message.meta.total_records)
                self.assertSchema(message.meta.schema, 'x', 'object', 'foo',
                                  'string')
                self.assertSchema(message.meta.schema, 'x', 'object', 'test',
                                  'integer')
            elif response.HasField("profile"):
                responses_seen += 1
                message.profile += response.profile

        report_marker = r'Profile report generated with the `pandas-profiling` Python package'
        self.assertRegex(message.profile, report_marker)
        self.assertRegex(message.profile, r'x/foo')
        self.assertRegex(message.profile, r'x/test')
예제 #6
0
    def test_profile_stream_not_so_huge_report_returns_2_profile_messages(
            self):
        """The profile for ``not-so-huge-report.json`` arrives in two parts.

        Only responses carrying a ``profile`` field are counted; exactly two
        of them are expected.
        """

        def request_messages():
            # One request per top-level element of the JSON fixture.
            with open('testdata/not-so-huge-report.json') as fixture:
                for entry in json.load(fixture):
                    yield profiler_pb2.ProfileRequest(
                        request_id="123", json_data=json.dumps(entry))

        # NOTE(review): the original carried a commented-out
        # os.getenv('PROFILER_TIMEOUT', '30') here -- presumably this test
        # depends on the profiler timeout setting; confirm.
        profile_parts = 0
        message = profiler_pb2.ProfileDataStreamResponse()
        for response in self._stub.ProfileDataStream(request_messages()):
            if not response.HasField("profile"):
                continue
            profile_parts += 1
            message.profile += response.profile

        # here we make sure multiple messages were returned to fit the profile size
        self.assertEqual(profile_parts, 2)
예제 #7
0
    def ProfileDataStream(self, request_iterator, context):
        """Profile a stream of JSON records and stream the result back.

        All requests are consumed first; their ``json_data`` payloads are
        parsed, fed into a schema builder and flattened into a data frame
        that is handed to ``run_profiler``. The HTML report (if any) is
        minified and split into pieces of at most ``MAX_MESSAGE_SIZE``
        characters.

        Args:
            request_iterator: iterable of requests exposing ``request_id``
                and ``json_data``.
            context: not used by this method.

        Yields:
            On success, one response with ``meta`` populated (request id,
            schema, record count, service version, sizes and, when the
            profiler failed or timed out, a ``PROFILE_EXCEPTION`` error),
            followed by one response per report piece with ``profile`` set
            (a single empty piece when there is no report).
            On failure, a single response whose ``meta`` carries the request
            id and an error: ``UNKNOWN_ENCODING`` when a payload is not
            valid JSON, ``NO_DATA`` for any other exception.
        """
        request_id = "none"
        builder = SchemaBuilder()
        builder.add_schema({"type": "object", "properties": {}})
        unknown_type = domain_pb2.ProfilerError.Type.Value('UNKNOWN')
        error = domain_pb2.ProfilerError(type=unknown_type)

        message = profiler_pb2.ProfileDataStreamResponse()
        total_records = 0
        record_list = []
        # Bound up front so the JSONDecodeError handler can always read it.
        record = None

        try:
            for record in request_iterator:
                total_records += 1
                request_id = record.request_id
                if total_records == 1:
                    logging.info(
                        'started profiling for request %s with config %s',
                        request_id, self.config_path)
                record_list.append(json.loads(record.json_data))

            for jd in record_list:
                builder.add_object(jd)
            data_frame = pd.DataFrame(json_normalize(record_list, sep='/'))

            profile = None
            # html stays None when no profile could be produced.
            html = None
            report_length = 0
            try:
                profile = run_profiler(data_frame)
            except FunctionTimedOut as te:
                err_msg = 'profile timeout for request_id %s after %ss data_frame shape (rows, cols): %s' % \
                          (request_id, te.timedOutAfter, data_frame.shape)
                logging.warning(err_msg)
                error = domain_pb2.ProfilerError(
                    message=err_msg,
                    type=domain_pb2.ProfilerError.Type.Value(
                        'PROFILE_EXCEPTION'))
            except Exception as e:
                # The original passed `e` without a placeholder, which makes
                # the logging module report a formatting error instead of
                # the exception text.
                logging.error('generic exception in timeout: %s', e)
                error = domain_pb2.ProfilerError(
                    message=str(e),
                    type=domain_pb2.ProfilerError.Type.Value(
                        'PROFILE_EXCEPTION'))

            schema = builder.to_schema()

            if profile is not None:
                html = minify(profile.to_html(),
                              remove_all_empty_space=True,
                              remove_comments=True)
                report_length = len(html)

            schema_json = json.dumps(schema)
            schema_length = len(schema_json)
            logging.info(
                'profiling complete for request %s total_records: %s, schema_length: %s, report_length: %s',
                request_id, total_records, schema_length, report_length)

            # The max message size of a GRPC call in bytes is 4194304. The header includes 5 bytes, 1 for
            # the compressed flag and 4 for the unsigned integer. Therefore should be 4194299
            # NOTE(review): the report is measured and sliced with len() /
            # slicing on `html`; if that is a str containing non-ASCII
            # characters a piece could exceed the byte limit -- confirm.
            MAX_MESSAGE_SIZE = 4194299

            if report_length == 0 or html is None:
                profile_stream = ['']
            else:
                # Consecutive pieces of at most MAX_MESSAGE_SIZE characters.
                profile_stream = [
                    html[start:start + MAX_MESSAGE_SIZE]
                    for start in range(0, report_length, MAX_MESSAGE_SIZE)
                ]

            # Only report an error when the profiler step actually set one.
            if error.type != unknown_type:
                message.meta.error.message = error.message
                message.meta.error.type = error.type

            message.meta.request_id = request_id
            message.meta.schema = schema_json
            message.meta.total_records = total_records
            message.meta.service_version = os.getenv(
                'SDM_PROFILER_SERVICE_VERSION', 'default')
            message.meta.schema_byte_size = schema_length
            message.meta.profile_byte_size = report_length

            yield message

            for profile_portion in profile_stream:
                message = profiler_pb2.ProfileDataStreamResponse()
                message.profile = profile_portion
                yield message
            return

        except json.decoder.JSONDecodeError as e:
            first_chars = '><'
            if record is not None and record.json_data is not None:
                first_chars = '>' + record.json_data[0:10] + '<'
            err_msg = 'profiling failed for request %s with error %s %s, record nr: %s first 10 chars %s' % \
                      (request_id, type(e), e, total_records, first_chars)
            logging.error(err_msg)
            error = domain_pb2.ProfilerError(
                message=err_msg,
                type=domain_pb2.ProfilerError.Type.Value('UNKNOWN_ENCODING'))

        except Exception as e:
            logging.error('profiling failed for request %s with error %s',
                          request_id, e)
            error = domain_pb2.ProfilerError(
                message=str(e),
                type=domain_pb2.ProfilerError.Type.Value('NO_DATA'))

        # Reached only via one of the handlers above: report the failure.
        message.meta.request_id = request_id
        message.meta.error.message = error.message
        message.meta.error.type = error.type
        yield message