def test_sql_select_csv_no_header(client, log_output):
    """Query a header-less CSV object by positional column name.

    The input is declared with ``file_header_info="NONE"`` and the single
    case selects ``s._2``; the expected bytes are the second field of each
    of the two records.

    Args:
        client: client object, forwarded unchanged to
            ``test_sql_expressions_custom_input_output``.
        log_output: test log object; forwarded to the helper and used to
            print the JSON report once the helper has returned.

    Raises:
        Exception: anything raised by the helper propagates unchanged.
    """
    # One record per line; the expected output below depends on there
    # being exactly two records.
    csv_testcontent = "val1,val2,val3\nval4,val5,val6\n"

    tests = [
        ("select_1", "SELECT s._2 FROM S3Object as s", b'val2\nval5\n'),
    ]

    input_serialization = InputSerialization(
        csv=CSVInput(
            file_header_info="NONE",
            allow_quoted_record_delimiter="FALSE",
        ),
    )
    output_serialization = OutputSerialization(csv=CSVOutput())

    # No try/except here: catching only to re-raise the same exception
    # adds nothing, so failures are simply allowed to propagate.
    test_sql_expressions_custom_input_output(
        client,
        csv_testcontent,
        input_serialization,
        output_serialization,
        tests,
        log_output,
    )

    # Test passes
    print(log_output.json_report())
def test_sql_select_json(client, log_output):
    """Run a set of S3 Select queries against a two-document JSON object.

    The object holds one document with a ``Rules`` array and one with
    ``created``/``modified`` keys. Each case pairs a label and a SQL
    expression with the bytes the query is expected to produce; how the
    cases are executed and compared is up to
    ``test_sql_expressions_custom_input_output``.

    Args:
        client: client object, forwarded unchanged to the helper.
        log_output: test log object; forwarded to the helper and used to
            print the JSON report once the helper has returned.

    Raises:
        Exception: anything raised by the helper propagates unchanged.
    """
    # Two JSON documents, one per line.
    json_testcontent = (
        '{ "Rules": [ {"id": "1"}, {"expr": "y > x"}, '
        '{"id": "2", "expr": "z = DEBUG"} ]}\n'
        '{ "created": "June 27", "modified": "July 6" }\n'
    )

    tests = [
        ("select_1",
         "SELECT id FROM S3Object[*].Rules[*].id",
         b'{"id":"1"}\n{}\n{"id":"2"}\n{}\n'),
        ("select_2",
         "SELECT id FROM S3Object[*].Rules[*].id WHERE id IS NOT MISSING",
         b'{"id":"1"}\n{"id":"2"}\n'),
        ("select_3",
         "SELECT d.created, d.modified FROM S3Object[*] d",
         b'{}\n{"created":"June 27","modified":"July 6"}\n'),
        ("select_4",
         "SELECT _1.created, _1.modified FROM S3Object[*]",
         b'{}\n{"created":"June 27","modified":"July 6"}\n'),
        ("select_5",
         "Select s.rules[1].expr from S3Object s",
         b'{"expr":"y > x"}\n{}\n'),
    ]

    input_serialization = InputSerialization(
        json=JSONInput(json_type="DOCUMENT"),
    )
    output_serialization = OutputSerialization(json=JSONOutput())

    # No try/except here: catching only to re-raise the same exception
    # adds nothing, so failures are simply allowed to propagate.
    test_sql_expressions_custom_input_output(
        client,
        json_testcontent,
        input_serialization,
        output_serialization,
        tests,
        log_output,
    )

    # Test passes
    print(log_output.json_report())
def test_csv_output_custom_quote_char(client, log_output):
    """Exercise CSV output quoting with separate quote and escape characters.

    Every case is ``(quote_char, escape_char, input_data, expected_output)``.
    The input side is always parsed with ``"`` as both quote and escape
    character; only the output serialization varies per case. Each case is
    handed to ``test_sql_api`` under the label ``test_<index>``.

    Args:
        client: client object used to create/remove the bucket and passed on
            to ``test_sql_api``.
        log_output: test log object; receives the bucket name and prints the
            JSON report at the end.
    """
    # Get a unique bucket name and record it in the log arguments.
    bucket_name = generate_bucket_name()
    log_output.args['bucket_name'] = bucket_name

    cases = [
        # Two-character quote string; the expected slot holds an Exception
        # (presumably read by test_sql_api as "this case must fail").
        ("''", "''", b'col1,col2,col3\n', Exception()),
        ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("", '"', b'col1,col2,col3\n',
         b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
        ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
        ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
        ('"', '"', b'""""\n', b'""""\n'),
        ('"', '"', b'\n', b''),
        ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"),
        ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'"col\'1","col2","col3"\n',
         b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'col\'\n', b"'col\\''\n"),
        # Two consecutive escaped quotes
        ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"),
    ]

    def make_options(out_quote, out_escape):
        """Build select options whose CSV output uses the given characters."""
        source_format = InputSerialization(
            compression_type="NONE",
            csv=CSVInput(
                FileHeaderInfo="NONE",
                RecordDelimiter="\n",
                FieldDelimiter=",",
                QuoteCharacter='"',
                QuoteEscapeCharacter='"',
                Comments="#",
                AllowQuotedRecordDelimiter="FALSE",
            ),
        )
        result_format = OutputSerialization(
            csv=CSVOutput(
                QuoteFields="ALWAYS",
                RecordDelimiter="\n",
                FieldDelimiter=",",
                QuoteCharacter=out_quote,
                QuoteEscapeCharacter=out_escape,
            ),
        )
        progress = RequestProgress(enabled="False")
        return SelectObjectOptions(
            expression="select * from s3object",
            input_serialization=source_format,
            output_serialization=result_format,
            request_progress=progress,
        )

    client.make_bucket(bucket_name)
    try:
        for idx, case in enumerate(cases):
            quote_char, escape_char, input_data, expected_output = case
            test_sql_api(
                f'test_{idx}',
                client,
                bucket_name,
                input_data,
                make_options(quote_char, escape_char),
                expected_output,
            )
    finally:
        # The bucket is removed whether or not a case failed.
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
def test_xml_marshal_select(self):
    """Check the XML produced by ``xml_marshal_select`` for CSV-to-CSV options.

    A fully populated ``SelectObjectOptions`` is marshalled and compared
    byte-for-byte with a hand-written ``SelectObjectContentRequest``
    document.
    """
    # Note that the expected document carries the boolean-like settings in
    # lower case ("false", "true") although they are supplied below as
    # "FALSE" and "TRUE".
    expected_string = b''.join((
        b'<SelectObjectContentRequest>',
        b'<Expression>select * from s3object</Expression>',
        b'<ExpressionType>SQL</ExpressionType>',
        b'<InputSerialization>',
        b'<CompressionType>NONE</CompressionType>',
        b'<CSV><FileHeaderInfo>USE</FileHeaderInfo>',
        b'<RecordDelimiter>\n</RecordDelimiter>',
        b'<FieldDelimiter>,</FieldDelimiter>',
        b'<QuoteCharacter>"</QuoteCharacter>',
        b'<QuoteEscapeCharacter>"</QuoteEscapeCharacter>',
        b'<Comments>#</Comments>',
        b'<AllowQuotedRecordDelimiter>false',
        b'</AllowQuotedRecordDelimiter></CSV>',
        b'</InputSerialization>',
        b'<OutputSerialization><CSV>',
        b'<QuoteFields>ASNEEDED</QuoteFields>',
        b'<RecordDelimiter>\n</RecordDelimiter>',
        b'<FieldDelimiter>,</FieldDelimiter>',
        b'<QuoteCharacter>"</QuoteCharacter>',
        b'<QuoteEscapeCharacter>"</QuoteEscapeCharacter>',
        b'</CSV></OutputSerialization>',
        b'<RequestProgress>',
        b'<Enabled>true</Enabled>',
        b'</RequestProgress>',
        b'</SelectObjectContentRequest>',
    ))

    csv_in = CSVInput(
        FileHeaderInfo="USE",
        RecordDelimiter="\n",
        FieldDelimiter=",",
        QuoteCharacter='"',
        QuoteEscapeCharacter='"',
        Comments="#",
        AllowQuotedRecordDelimiter="FALSE",
    )
    source_format = InputSerialization(compression_type="NONE", csv=csv_in)

    csv_out = CSVOutput(
        QuoteFields="ASNEEDED",
        RecordDelimiter="\n",
        FieldDelimiter=",",
        QuoteCharacter='"',
        QuoteEscapeCharacter='"',
    )
    result_format = OutputSerialization(csv=csv_out)

    progress = RequestProgress(enabled="TRUE")

    options = SelectObjectOptions(
        expression="select * from s3object",
        input_serialization=source_format,
        output_serialization=result_format,
        request_progress=progress,
    )

    eq_(expected_string, xml_marshal_select(options))
def test_sql_expressions(client, input_json_bytes, tests, log_output):
    """Run ``tests`` with JSON-document input and CSV output.

    Convenience wrapper around ``test_sql_expressions_custom_input_output``
    that fixes the serialization: uncompressed JSON of type ``DOCUMENT`` on
    the way in, CSV with ``quote_fields="ASNEEDED"`` on the way out.

    Args:
        client: client object, forwarded unchanged.
        input_json_bytes: object content, forwarded unchanged.
        tests: test cases, forwarded unchanged.
        log_output: test log object, forwarded unchanged.
    """
    json_document = JSONInput(json_type="DOCUMENT")
    source_format = InputSerialization(
        compression_type="NONE",
        json=json_document,
    )

    csv_as_needed = CSVOutput(quote_fields="ASNEEDED")
    result_format = OutputSerialization(csv=csv_as_needed)

    test_sql_expressions_custom_input_output(
        client,
        input_json_bytes,
        source_format,
        result_format,
        tests,
        log_output,
    )
def test_csv_input_custom_quote_char(client, log_output):
    """Exercise CSV input parsing with separate quote and escape characters.

    Every case is ``(quote_char, escape_char, data, expected_output)``. The
    quote and escape characters configure the CSV *input*; results are
    requested as JSON records delimited by a newline. Each case is handed to
    ``test_sql_api`` under the label ``test_<index>``.

    Args:
        client: client object used to create/remove the bucket and passed on
            to ``test_sql_api``.
        log_output: test log object; receives the bucket name and prints the
            JSON report at the end.
    """
    # Get a unique bucket name and record it in the log arguments.
    bucket_name = generate_bucket_name()
    log_output.args['bucket_name'] = bucket_name

    cases = [
        # Invalid quote character, should fail
        ('""', '"', b'col1,col2,col3\n', Exception()),
        # UTF-8 quote character
        ('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(),
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        # Only one field is quoted
        ('"', '"', b'"col1",col2,col3\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
        ('\'', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        # Same case as the previous entry; kept so the case indices used in
        # the test labels stay as they were.
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1","col2","col3"\n',
         b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
        ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
        ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'),
        ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'),
        ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'),
        ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'),
        ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'),
        ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'),
        ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'),
        ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'),
    ]

    def make_options(in_quote, in_escape):
        """Build select options whose CSV input uses the given characters."""
        source_format = InputSerialization(
            compression_type="NONE",
            csv=CSVInput(
                FileHeaderInfo="NONE",
                RecordDelimiter="\n",
                FieldDelimiter=",",
                QuoteCharacter=in_quote,
                QuoteEscapeCharacter=in_escape,
                Comments="#",
                AllowQuotedRecordDelimiter="FALSE",
            ),
        )
        result_format = OutputSerialization(
            json=JsonOutput(RecordDelimiter="\n"),
        )
        progress = RequestProgress(enabled="False")
        return SelectObjectOptions(
            expression="select * from s3object",
            input_serialization=source_format,
            output_serialization=result_format,
            request_progress=progress,
        )

    client.make_bucket(bucket_name)
    try:
        for idx, case in enumerate(cases):
            quote_char, escape_char, data, expected_output = case
            test_sql_api(
                f'test_{idx}',
                client,
                bucket_name,
                data,
                make_options(quote_char, escape_char),
                expected_output,
            )
    finally:
        # The bucket is removed whether or not a case failed.
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
def test_csv_output_quote_char(client, log_output):
    """Exercise CSV output quoting where quote and escape character coincide.

    Every case is ``(quote_char, object_content, expected_output)``. The
    input is always parsed with ``"`` as quote and escape character; the
    case's ``quote_char`` is used as both quote and escape character of the
    CSV output. A case whose expected output is an ``Exception`` instance
    must make ``exec_select`` raise; any other case must return exactly the
    expected bytes.

    Args:
        client: client object used to create/remove the bucket and passed on
            to ``exec_select``.
        log_output: test log object; receives the bucket name, is passed to
            ``exec_select`` and prints the JSON report at the end.

    Raises:
        ValueError: if a case fails unexpectedly, succeeds although a
            failure was expected, or returns different data.
        Exception: errors from bucket creation/removal propagate with their
            original type.
    """
    # Get a unique bucket_name and object_name
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()

    tests = [
        # Two-character quote string: the select is expected to fail.
        ("''", b'col1,col2,col3\n', Exception()),
        ("'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("", b'col1,col2,col3\n',
         b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
        ('"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
        ('"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
        ('"', b'\n', b''),
    ]

    # Create the bucket before entering the try block: if creation fails
    # there is nothing to clean up, and attempting the removal anyway would
    # replace the real error with a cleanup error.
    client.make_bucket(bucket_name)
    try:
        for idx, (quote_char, object_content, expected_output) in enumerate(
                tests):
            options = SelectObjectOptions(
                expression="select * from s3object",
                input_serialization=InputSerialization(
                    compression_type="NONE",
                    csv=CSVInput(
                        FileHeaderInfo="NONE",
                        RecordDelimiter="\n",
                        FieldDelimiter=",",
                        QuoteCharacter='"',
                        QuoteEscapeCharacter='"',
                        Comments="#",
                        AllowQuotedRecordDelimiter="FALSE",
                    ),
                ),
                output_serialization=OutputSerialization(
                    csv=CSVOutput(
                        QuoteFields="ALWAYS",
                        RecordDelimiter="\n",
                        FieldDelimiter=",",
                        QuoteCharacter=quote_char,
                        QuoteEscapeCharacter=quote_char,
                    ),
                ),
                request_progress=RequestProgress(enabled="False"),
            )

            try:
                got_output = exec_select(client, bucket_name, object_content,
                                         options, log_output)
            except Exception as select_err:
                # A failure is fine only when the case asked for one.
                if not isinstance(expected_output, Exception):
                    raise ValueError(
                        'Test {} unexpectedly failed with: {}'.format(
                            idx + 1, select_err)) from select_err
            else:
                if isinstance(expected_output, Exception):
                    raise ValueError(
                        'Test {}: expected an exception, got {}'.format(
                            idx + 1, got_output))
                if got_output != expected_output:
                    raise ValueError(
                        'Test {}: data mismatch. Expected : {}. Received: {}.'.
                        format(idx + 1, expected_output, got_output))
    finally:
        # Errors are no longer re-wrapped in a plain Exception, so callers
        # see the original type and traceback.
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
def test_csv_input_quote_char(client, log_output):
    """Exercise CSV input parsing where quote and escape character coincide.

    Every case is ``(quote_char, object_content, expected_output)``. The
    case's ``quote_char`` is used as both quote and escape character of the
    CSV input; results are requested as JSON records delimited by a
    newline. A case whose expected output is an ``Exception`` instance must
    make ``exec_select`` raise; any other case must return exactly the
    expected bytes.

    Args:
        client: client object used to create/remove the bucket and passed on
            to ``exec_select``.
        log_output: test log object; receives the bucket name, is passed to
            ``exec_select`` and prints the JSON report at the end.

    Raises:
        ValueError: if a case fails unexpectedly, succeeds although a
            failure was expected, or returns different data.
        Exception: errors from bucket creation/removal propagate with their
            original type.
    """
    # Get a unique bucket_name and object_name
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()

    tests = [
        # Invalid quote character, should fail
        ('""', b'col1,col2,col3\n', Exception()),
        # UTF-8 quote character
        ('ع',
         b'\xd8\xb9col1\xd8\xb9,\xd8\xb9col2\xd8\xb9,\xd8\xb9col3\xd8\xb9\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        # Only one field is quoted
        ('"', b'"col1",col2,col3\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        ('"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
        ('\'', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        # NOTE(review): identical to the previous case; kept so the test
        # numbers reported in failure messages do not shift.
        ('', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', b'"col1","col2","col3"\n',
         b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
        ('"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
    ]

    # Create the bucket before entering the try block: if creation fails
    # there is nothing to clean up, and attempting the removal anyway would
    # replace the real error with a cleanup error.
    client.make_bucket(bucket_name)
    try:
        for idx, (quote_char, object_content, expected_output) in enumerate(
                tests):
            options = SelectObjectOptions(
                expression="select * from s3object",
                input_serialization=InputSerialization(
                    compression_type="NONE",
                    csv=CSVInput(
                        FileHeaderInfo="NONE",
                        RecordDelimiter="\n",
                        FieldDelimiter=",",
                        QuoteCharacter=quote_char,
                        QuoteEscapeCharacter=quote_char,
                        Comments="#",
                        AllowQuotedRecordDelimiter="FALSE",
                    ),
                ),
                output_serialization=OutputSerialization(
                    json=JsonOutput(RecordDelimiter="\n"),
                ),
                request_progress=RequestProgress(enabled="False"),
            )

            try:
                got_output = exec_select(client, bucket_name, object_content,
                                         options, log_output)
            except Exception as select_err:
                # A failure is fine only when the case asked for one.
                if not isinstance(expected_output, Exception):
                    raise ValueError(
                        'Test {} unexpectedly failed with: {}'.format(
                            idx + 1, select_err)) from select_err
            else:
                if isinstance(expected_output, Exception):
                    raise ValueError(
                        'Test {}: expected an exception, got {}'.format(
                            idx + 1, got_output))
                if got_output != expected_output:
                    raise ValueError(
                        'Test {}: data mismatch. Expected : {}, Received {}'.
                        format(idx + 1, expected_output, got_output))
    finally:
        # Errors are no longer re-wrapped in a plain Exception, so callers
        # see the original type and traceback.
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
InputSerialization, OutputSerialization, CSVOutput, JsonOutput) client = Minio('s3.amazonaws.com', access_key='YOUR-ACCESSKEY', secret_key='YOUR-SECRETKEY') options = SelectObjectOptions( expression="select * from s3object", input_serialization=InputSerialization( compression_type="NONE", csv=CSVInput( FileHeaderInfo="USE", RecordDelimiter="\n", FieldDelimiter=",", QuoteCharacter='"', QuoteEscapeCharacter='"', Comments="#", AllowQuotedRecordDelimiter="FALSE", ), # If input is JSON # json=JSONInput(Type="DOCUMENT",) ), output_serialization=OutputSerialization( csv=CSVOutput( QuoteFields="ASNEEDED", RecordDelimiter="\n", FieldDelimiter=",", QuoteCharacter='"', QuoteEscapeCharacter='"', )
# from minio.select.options import JsonInput # from minio.select.options import ParquetInput client = Minio('s3.amazonaws.com', access_key='YOUR-ACCESSKEY', secret_key='YOUR-SECRETKEY') options = SelectObjectOptions( expression="select * from s3object", input_serialization=InputSerialization( compression_type="NONE", csv=CSVInput( file_header_info="USE", record_delimiter="\n", field_delimiter=",", quote_character='"', quote_escape_character='"', comments="#", allow_quoted_record_delimiter="FALSE", ), # If input is JSON # json=JSONInput(json_type="DOCUMENT") ), output_serialization=OutputSerialization( csv=CSVOutput( quote_fields="ASNEEDED", record_delimiter="\n", field_delimiter=",", quote_character='"', quote_escape_character='"', ),