def _cleanup_minio():
    """Delete every object in the test S3 bucket, if the bucket exists."""
    if MINIO.bucket_exists(S3_BUCKET):
        # recursive=True: without it list_objects only yields top-level
        # keys/common prefixes, so objects stored under prefixes
        # (e.g. "some_prefix/fruits.csv") would survive the cleanup.
        objects = [
            o.object_name
            for o in MINIO.list_objects(bucket_name=S3_BUCKET, recursive=True)
        ]
        # remove_objects is a lazy iterator (of deletion errors), so we
        # force-evaluate it to actually perform the deletions.
        list(MINIO.remove_objects(bucket_name=S3_BUCKET, objects_iter=objects))
def clean_minio():
    """Yield a Minio client pointed at an empty test bucket; empty it again afterwards.

    NOTE(review): shadowed by the later redefinition of clean_minio below.
    """
    if not MINIO.bucket_exists(S3_BUCKET):
        MINIO.make_bucket(S3_BUCKET)
    # Drop any objects left over from previous runs before the test starts.
    _cleanup_minio()
    yield MINIO
    # Comment this out if tests fail and you want to inspect the bucket contents.
    _cleanup_minio()
def clean_minio():
    """Yield a Minio client with a guaranteed-empty test bucket, cleaning up afterwards."""
    try:
        MINIO.make_bucket(S3_BUCKET)
    except (BucketAlreadyExists, BucketAlreadyOwnedByYou):
        # Bucket is already there (possibly from a previous run) -- that's fine.
        pass
    # Remove any stale objects so the test starts from a clean slate.
    _cleanup_minio()
    yield MINIO
    # Comment this out if tests fail and you want to inspect the bucket contents.
    _cleanup_minio()
def test_csv_introspection_http():
    """Introspect a CSV served over a plain (presigned) HTTP URL."""
    presigned_url = MINIO.presigned_get_object("test_csv", "some_prefix/fruits.csv")
    schema = CSVForeignDataWrapper.import_schema(
        schema=None,
        srv_options={"url": presigned_url},
        options={},
        restriction_type=None,
        restricts=[],
    )

    expected_columns = [
        {"column_name": "fruit_id", "type_name": "integer"},
        {"column_name": "timestamp", "type_name": "timestamp"},
        {"column_name": "name", "type_name": "character varying"},
        {"column_name": "number", "type_name": "integer"},
        {"column_name": "bignumber", "type_name": "bigint"},
        {"column_name": "vbignumber", "type_name": "numeric"},
    ]

    assert len(schema) == 1
    assert schema[0] == {
        "table_name": "data",
        "schema": None,
        "columns": expected_columns,
        "options": {
            "autodetect_dialect": "false",
            "autodetect_encoding": "false",
            "autodetect_header": "false",
            "delimiter": ",",
            "encoding": "utf-8",
            "header": "true",
            "quotechar": '"',
        },
    }
def test_csv_data_source_http(local_engine_empty):
    """Introspect and preview a CSV data source backed by a presigned HTTP URL."""
    url = MINIO.presigned_get_object("test_csv", "some_prefix/rdu-weather-history.csv")
    source = CSVDataSource(
        local_engine_empty,
        credentials={},
        params={"url": url},
    )

    schema = source.introspect()
    # A single "data" table with 28 columns is expected.
    assert len(schema) == 1
    assert len(schema["data"][0]) == 28

    preview = source.preview(schema)
    assert len(preview) == 1
    assert len(preview["data"]) == 10
def test_csv_data_source_raw_url(local_engine_empty):
    """Check get_raw_url() passes plain URLs through and presigns S3 objects."""
    # Reuse the dataset from the previous test for the raw URL functionality.
    url = MINIO.presigned_get_object("test_csv", "some_prefix/rdu-weather-history.csv")
    credentials = {"s3_access_key": "minioclient", "s3_secret_key": "supersecure"}
    params = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": False,
        "s3_bucket": "test_csv",
        # Canary: table-level params must override this server-level delimiter.
        "delimiter": ",",
    }
    tables = {
        "from_url": ([], {"url": url}),
        "from_s3_rdu": ([], {"s3_object": "some_prefix/rdu-weather-history.csv"}),
        "from_s3_encoding": ([], {"s3_object": "some_prefix/encoding-win-1252.csv"}),
        "from_url_broken": ([], {"url": "invalid_url"}),
        "from_s3_broken": ([], {"s3_object": "invalid_object"}),
    }

    source = CSVDataSource(local_engine_empty, credentials, params, tables)
    schema = unwrap(source.introspect())[0]

    raw_urls = source.get_raw_url(tables=schema)
    assert raw_urls == {
        "from_s3_encoding": [("text/csv", mock.ANY)],
        "from_s3_rdu": [("text/csv", mock.ANY)],
        "from_url": [("text/csv", url)],
    }
    # Both S3-backed tables should resolve to URLs on our Minio endpoint.
    assert "objectstorage:9000" in raw_urls["from_s3_encoding"][0][1]
    assert "objectstorage:9000" in raw_urls["from_s3_rdu"][0][1]
def test_csv_data_source_multiple(local_engine_empty):
    """End-to-end version of test_csv_introspection_multiple: check table params
    get serialized/deserialized properly, mounting works and table-level
    overrides (delimiter) take effect on reintrospection.
    """
    url = MINIO.presigned_get_object("test_csv", "some_prefix/rdu-weather-history.csv")
    credentials = {
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
    }
    params = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": False,
        "s3_bucket": "test_csv",
        # Put this delimiter in as a canary to make sure table params override server params.
        "delimiter": ",",
    }
    tables = {
        # Pass an empty table schema to denote we want to introspect it
        "from_url": ([], {"url": url}),
        "from_s3_rdu": ([], {"s3_object": "some_prefix/rdu-weather-history.csv"}),
        "from_s3_encoding": ([], {"s3_object": "some_prefix/encoding-win-1252.csv"}),
        "from_url_broken": ([], {"url": "invalid_url"}),
        "from_s3_broken": ([], {"s3_object": "invalid_object"}),
    }

    source = CSVDataSource(
        local_engine_empty,
        credentials,
        params,
        tables,
    )

    schema = source.introspect()
    assert schema == {
        "from_url": (
            mock.ANY,
            {
                "autodetect_dialect": False,
                "url": url,
                "quotechar": '"',
                "header": True,
                "encoding": "utf-8",
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_s3_rdu": (
            mock.ANY,
            {
                "encoding": "utf-8",
                "autodetect_dialect": False,
                "autodetect_encoding": False,
                "autodetect_header": False,
                "delimiter": ";",
                "header": True,
                "quotechar": '"',
                "s3_object": "some_prefix/rdu-weather-history.csv",
            },
        ),
        "from_s3_encoding": (
            mock.ANY,
            {
                "s3_object": "some_prefix/encoding-win-1252.csv",
                "quotechar": '"',
                "header": True,
                "encoding": "Windows-1252",
                "autodetect_dialect": False,
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_url_broken": MountError(
            table_name="from_url_broken",
            error="requests.exceptions.MissingSchema",
            # Fixed: this string literal previously contained a raw (unescaped)
            # newline, which is a syntax error in a plain Python string.
            error_text="Invalid URL 'invalid_url': No schema supplied. "
            "Perhaps you meant http://invalid_url?",
        ),
        "from_s3_broken": MountError(
            table_name="from_s3_broken",
            error="minio.error.S3Error",
            error_text=mock.ANY,
        ),
    }

    # Mount the datasets with this introspected schema.
    schema = unwrap(schema)[0]
    try:
        source.mount("temp_data", tables=schema)
        rows = local_engine_empty.run_sql("SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        assert len(rows[0]) == 3
    finally:
        local_engine_empty.delete_schema("temp_data")

    # Override the delimiter and blank out the schema for a single table
    schema["from_s3_encoding"] = (
        [],
        {
            "s3_object": "some_prefix/encoding-win-1252.csv",
            "quotechar": '"',
            "header": True,
            "encoding": "Windows-1252",
            "autodetect_dialect": False,
            # We force a delimiter "," here which will make the CSV a single-column one
            # (to test we can actually override these)
            "delimiter": ",",
            "autodetect_header": False,
            "autodetect_encoding": False,
        },
    )

    # Reintrospect the source with the new table parameters
    source = CSVDataSource(local_engine_empty, credentials, params, schema)
    new_schema = source.introspect()
    assert len(new_schema) == 3

    # Check other tables are unchanged
    assert new_schema["from_url"] == schema["from_url"]
    assert new_schema["from_s3_rdu"] == schema["from_s3_rdu"]

    # Table with a changed separator only has one column (since we have , for delimiter
    # instead of ;)
    assert new_schema["from_s3_encoding"][0] == [
        TableColumn(
            ordinal=1,
            name=";DATE;TEXT",
            pg_type="character varying",
            is_pk=False,
            comment=None,
        )
    ]

    try:
        source.mount("temp_data", tables=new_schema)
        rows = local_engine_empty.run_sql("SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        # Check we get one column now
        assert rows[0] == ("1;01/07/2021;Pañamao",)
    finally:
        local_engine_empty.delete_schema("temp_data")
def test_csv_introspection_multiple():
    """Introspect several tables at once via CREATE FOREIGN SCHEMA params.

    We only supply table names and their S3 key / URL and expect the FDW to
    figure out the rest (dialect, encoding, header, columns).
    """
    fdw_options = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": "false",
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
        "s3_bucket": "test_csv",
        "s3_object_prefix": "some_prefix/",
    }
    url = MINIO.presigned_get_object("test_csv", "some_prefix/rdu-weather-history.csv")

    table_options = {
        "from_url": {"url": url},
        "from_s3_rdu": {"s3_object": "some_prefix/rdu-weather-history.csv"},
        "from_s3_encoding": {"s3_object": "some_prefix/encoding-win-1252.csv"},
    }
    schema = CSVForeignDataWrapper.import_schema(
        schema=None,
        srv_options=fdw_options,
        options={"table_options": json.dumps(table_options)},
        restriction_type=None,
        restricts=[],
    )

    assert len(schema) == 3
    # Sort by table name so the assertions below are deterministic.
    schema = sorted(schema, key=lambda s: s["table_name"])

    assert schema[0] == {
        "table_name": "from_s3_encoding",
        "schema": None,
        "columns": mock.ANY,
        "options": _s3_win_1252_opts,
    }
    assert schema[1] == {
        "table_name": "from_s3_rdu",
        "schema": None,
        "columns": mock.ANY,
        "options": {
            "autodetect_dialect": "false",
            "autodetect_encoding": "false",
            "autodetect_header": "false",
            "delimiter": ";",
            "encoding": "utf-8",
            "header": "true",
            "quotechar": '"',
            "s3_object": "some_prefix/rdu-weather-history.csv",
        },
    }
    assert schema[2] == {
        "table_name": "from_url",
        "schema": None,
        "columns": mock.ANY,
        "options": {
            "autodetect_dialect": "false",
            "autodetect_encoding": "false",
            "autodetect_header": "false",
            "delimiter": ";",
            "encoding": "utf-8",
            "header": "true",
            "quotechar": '"',
            "url": url,
        },
    }