def test_profile_stream_error_no_data(self):
    """Requests that carry no ``json_data`` must yield an UNKNOWN_ENCODING error."""

    def request_messages():
        # Five requests with a request id but no payload at all.
        for _ in range(5):
            yield profiler_pb2.ProfileRequest(request_id="123")

    # test
    received = 0
    collected = profiler_pb2.ProfileDataStreamResponse()
    meta_fields = (
        'request_id',
        'schema',
        'total_records',
        'service_version',
        'schema_byte_size',
        'profile_byte_size',
    )
    for response in self._stub.ProfileDataStream(request_messages()):
        if response.HasField("meta"):
            received += 1
            # Mirror the metadata of the response onto the local message.
            for field in meta_fields:
                setattr(collected.meta, field, getattr(response.meta, field))
            collected.meta.error.message = response.meta.error.message
            collected.meta.error.type = response.meta.error.type
        elif response.HasField("profile"):
            received += 1
            collected.profile += response.profile

    expected_error = domain_pb2.ProfilerError.Type.Value('UNKNOWN_ENCODING')
    self.assertEqual(json.loads(collected.meta.request_id), 123)
    self.assertEqual(0, collected.meta.total_records)
    self.assertEqual(collected.meta.error.type, expected_error)
def test_profile_stream_size_huge_report_timeout(self):
    """Stream the records of ``testdata/huge-report.json`` and check the returned metadata."""

    def request_messages():
        with open('testdata/huge-report.json') as json_file:
            records = json.load(json_file)
        # One request per record found in the fixture file.
        for entry in records:
            yield profiler_pb2.ProfileRequest(
                request_id="123", json_data=json.dumps(entry))

    # test
    received = 0
    collected = profiler_pb2.ProfileDataStreamResponse()
    meta_fields = (
        'request_id',
        'schema',
        'total_records',
        'service_version',
        'schema_byte_size',
        'profile_byte_size',
    )
    for response in self._stub.ProfileDataStream(request_messages()):
        if response.HasField("meta"):
            received += 1
            for field in meta_fields:
                setattr(collected.meta, field, getattr(response.meta, field))
            # The metadata is verified as soon as it arrives.
            self.assertEqual(json.loads(collected.meta.request_id), 123)
            self.assertEqual(3, collected.meta.total_records)
            self.assertSchema(collected.meta.schema, 'field1', 'string',
                              'field101', 'string')
        elif response.HasField("profile"):
            received += 1
            collected.profile += response.profile

    # Validate error thrown
    # NOTE(review): the loop above never copies meta.error from the response,
    # so this only inspects the untouched value on the local message - confirm
    # whether the error of the response was meant to be asserted here.
    self.assertEqual(0, collected.meta.error.type)
def test_profile_stream_size_toeggolomat(self):
    """Stream the records of ``testdata/toeggelomat_join.json`` and check the returned metadata."""

    def request_messages():
        with open('testdata/toeggelomat_join.json') as json_file:
            records = json.load(json_file)
        # One request per record found in the fixture file.
        for entry in records:
            yield profiler_pb2.ProfileRequest(
                request_id="123", json_data=json.dumps(entry))

    # test
    received = 0
    collected = profiler_pb2.ProfileDataStreamResponse()
    meta_fields = (
        'request_id',
        'schema',
        'total_records',
        'service_version',
        'schema_byte_size',
        'profile_byte_size',
    )
    for response in self._stub.ProfileDataStream(request_messages()):
        if response.HasField("meta"):
            received += 1
            for field in meta_fields:
                setattr(collected.meta, field, getattr(response.meta, field))
            # The metadata is verified as soon as it arrives.
            self.assertEqual(json.loads(collected.meta.request_id), 123)
            self.assertEqual(27, collected.meta.total_records)
            self.assertSchema(collected.meta.schema, 'matchUuid', 'string',
                              'blueScore', 'integer')
        elif response.HasField("profile"):
            received += 1
            collected.profile += response.profile
def test_profile_stream_no_error_without_request_id(self):
    """Requests that omit ``request_id`` must still produce a schema and a report."""

    def request_messages():
        # The same flat record five times, without any request id.
        payload = json.dumps({'x': 'a', 'y': 2})
        for _ in range(5):
            yield profiler_pb2.ProfileRequest(json_data=payload)

    # test
    received = 0
    collected = profiler_pb2.ProfileDataStreamResponse()
    meta_fields = (
        'request_id',
        'schema',
        'total_records',
        'service_version',
        'schema_byte_size',
        'profile_byte_size',
    )
    for response in self._stub.ProfileDataStream(request_messages()):
        if response.HasField("meta"):
            received += 1
            for field in meta_fields:
                setattr(collected.meta, field, getattr(response.meta, field))
            collected.meta.error.message = response.meta.error.message
            collected.meta.error.type = response.meta.error.type
            # The metadata is verified as soon as it arrives.
            self.assertSchema(collected.meta.schema, 'x', 'string', 'y',
                              'integer')
            self.assertEqual(5, collected.meta.total_records)
        elif response.HasField("profile"):
            received += 1
            collected.profile += response.profile

    # The concatenated chunks must contain the footer of the report.
    self.assertRegex(
        collected.profile,
        r'Profile report generated with the `pandas-profiling` Python package'
    )
def test_profile_stream_json_normalize(self):
    """Nested objects must show up in the schema and as ``x/...`` entries in the report."""

    def request_messages():
        # The same record with a nested object under 'x', sent five times.
        payload = json.dumps({
            'x': {
                'test': 1,
                'foo': 'bar'
            },
            'y': 2
        })
        for _ in range(5):
            yield profiler_pb2.ProfileRequest(request_id="123",
                                              json_data=payload)

    # test
    received = 0
    collected = profiler_pb2.ProfileDataStreamResponse()
    meta_fields = (
        'request_id',
        'schema',
        'total_records',
        'service_version',
        'schema_byte_size',
        'profile_byte_size',
    )
    for response in self._stub.ProfileDataStream(request_messages()):
        if response.HasField("meta"):
            received += 1
            for field in meta_fields:
                setattr(collected.meta, field, getattr(response.meta, field))
            # The metadata is verified as soon as it arrives.
            self.assertEqual(json.loads(collected.meta.request_id), 123)
            self.assertEqual(5, collected.meta.total_records)
            self.assertSchema(collected.meta.schema, 'x', 'object', 'foo',
                              'string')
            self.assertSchema(collected.meta.schema, 'x', 'object', 'test',
                              'integer')
        elif response.HasField("profile"):
            received += 1
            collected.profile += response.profile

    # The concatenated chunks must contain the report footer and the
    # slash-separated names of the nested fields.
    self.assertRegex(
        collected.profile,
        r'Profile report generated with the `pandas-profiling` Python package'
    )
    self.assertRegex(collected.profile, r'x/foo')
    self.assertRegex(collected.profile, r'x/test')
def test_profile_stream_not_so_huge_report_returns_2_profile_messages(
        self):
    """The report for ``testdata/not-so-huge-report.json`` must arrive in exactly two chunks."""

    def request_messages():
        with open('testdata/not-so-huge-report.json') as json_file:
            records = json.load(json_file)
        # One request per record found in the fixture file.
        for entry in records:
            yield profiler_pb2.ProfileRequest(
                request_id="123", json_data=json.dumps(entry))

    # test
    profile_chunks = 0
    collected = profiler_pb2.ProfileDataStreamResponse()
    for response in self._stub.ProfileDataStream(request_messages()):
        # Only the report chunks are counted; other responses are ignored.
        if not response.HasField("profile"):
            continue
        profile_chunks += 1
        collected.profile += response.profile

    # here we make sure multiple messages were returned to fit the profile size
    self.assertEqual(profile_chunks, 2)
def ProfileDataStream(self, request_iterator, context):
    """Profile a stream of JSON records and stream back metadata and the report.

    Every request's ``json_data`` is parsed and collected. A JSON schema is
    built from the collected records and a profile report is generated from
    the normalized data frame. A profiler timeout or failure is recorded as
    a ``PROFILE_EXCEPTION`` error in the metadata; the schema and the record
    count are still returned, together with a single empty profile chunk.

    Args:
        request_iterator: iterable of requests exposing ``request_id`` and
            ``json_data``.
        context: RPC context; not used by this method.

    Yields:
        ``profiler_pb2.ProfileDataStreamResponse`` messages. On the normal
        path, one message with ``meta`` populated followed by one message
        per report chunk in ``profile``. If a record cannot be decoded
        (``UNKNOWN_ENCODING``) or any other exception escapes (``NO_DATA``),
        a single message whose ``meta`` carries the request id and the error.
    """
    request_id = "none"
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {}})
    error = domain_pb2.ProfilerError(
        type=domain_pb2.ProfilerError.Type.Value('UNKNOWN'))
    message = profiler_pb2.ProfileDataStreamResponse()
    total_records = 0
    record_list = []
    # Bound up front so the JSON error handler below can inspect it safely.
    record = None
    try:
        for record in request_iterator:
            total_records += 1
            request_id = record.request_id
            if total_records == 1:
                logging.info(
                    'started profiling for request %s with config %s',
                    request_id, self.config_path)
            record_list.append(json.loads(record.json_data))

        for json_record in record_list:
            builder.add_object(json_record)

        data_frame = pd.DataFrame(json_normalize(record_list, sep='/'))
        profile = None
        # Bound up front: stays None when the profiler produced no report.
        html = None
        report_length = 0
        try:
            profile = run_profiler(data_frame)
        except FunctionTimedOut as te:
            err_msg = 'profile timeout for request_id %s after %ss data_frame shape (rows, cols): %s' % \
                      (request_id, te.timedOutAfter, data_frame.shape)
            logging.warning(err_msg)
            error = domain_pb2.ProfilerError(
                message=err_msg,
                type=domain_pb2.ProfilerError.Type.Value(
                    'PROFILE_EXCEPTION'))
        except Exception as e:
            # The placeholder is required: passing an argument to a format
            # string without one makes the logging module report a
            # formatting error instead of the actual exception.
            logging.error('generic exception in timeout: %s', e)
            error = domain_pb2.ProfilerError(
                message=str(e),
                type=domain_pb2.ProfilerError.Type.Value(
                    'PROFILE_EXCEPTION'))

        schema = builder.to_schema()
        if profile is not None:
            html = profile.to_html()
            html = minify(html,
                          remove_all_empty_space=True,
                          remove_comments=True)
            report_length = len(html)

        schema_json = json.dumps(schema)
        schema_length = len(schema_json)
        logging.info(
            'profiling complete for request %s total_records: %s, schema_length: %s, report_length: %s',
            request_id, total_records, schema_length, report_length)

        # The max message size of a GRPC call in bytes is 4194304. The header
        # includes 5 bytes, 1 for the compressed flag and 4 for the unsigned
        # integer. Therefore should be 4194299.
        # NOTE(review): the report is sliced by characters, not by encoded
        # bytes - confirm the report cannot contain multi-byte characters.
        MAX_MESSAGE_SIZE = 4194299
        if report_length == 0 or html is None:
            # No report available: still send one (empty) profile chunk.
            profile_stream = ['']
        else:
            profile_stream = [
                html[start:start + MAX_MESSAGE_SIZE]
                for start in range(0, report_length, MAX_MESSAGE_SIZE)
            ]

        # Only report an error that was actually set by the profiler step.
        if error is not None and error.type != domain_pb2.ProfilerError.Type.Value(
                'UNKNOWN'):
            message.meta.error.message = error.message
            message.meta.error.type = error.type

        message.meta.request_id = request_id
        message.meta.schema = schema_json
        message.meta.total_records = total_records
        message.meta.service_version = os.getenv(
            'SDM_PROFILER_SERVICE_VERSION', 'default')
        message.meta.schema_byte_size = schema_length
        message.meta.profile_byte_size = report_length
        yield message

        for profile_portion in profile_stream:
            message = profiler_pb2.ProfileDataStreamResponse()
            message.profile = profile_portion
            yield message
        return
    except json.decoder.JSONDecodeError as e:
        first_chars = '><'
        if record is not None and record.json_data is not None:
            first_chars = '>' + record.json_data[0:10] + '<'
        err_msg = 'profiling failed for request %s with error %s %s, record nr: %s first 10 chars %s' % \
                  (request_id, type(e), e, total_records, first_chars)
        logging.error(err_msg)
        error = domain_pb2.ProfilerError(
            message=err_msg,
            type=domain_pb2.ProfilerError.Type.Value('UNKNOWN_ENCODING'))
    except Exception as e:
        logging.error('profiling failed for request %s with error %s',
                      request_id, e)
        error = domain_pb2.ProfilerError(
            message=str(e),
            type=domain_pb2.ProfilerError.Type.Value('NO_DATA'))

    # Error path: answer with a single metadata message describing the failure.
    message.meta.request_id = request_id
    message.meta.error.message = error.message
    message.meta.error.type = error.type
    yield message