def test_s3_real_aws():
    # Exercise connection code with an AWS-backed S3 bucket.
    # This is a minimal integration check for ARROW-9261 and similar issues.
    from pyarrow.fs import FileSelector, S3FileSystem

    fs = S3FileSystem(anonymous=True)
    entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
    assert len(entries) > 0
def test_s3_options(monkeypatch):
    from pyarrow.fs import S3FileSystem

    # Avoid wait for unavailable metadata server in ARN role example below
    monkeypatch.setenv("AWS_EC2_METADATA_DISABLED", "true")

    fs = S3FileSystem(access_key='access', secret_key='secret',
                      session_token='token', region='us-east-2',
                      scheme='https', endpoint_override='localhost:8999')
    assert isinstance(fs, S3FileSystem)
    assert fs.region == 'us-east-2'
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(role_arn='role', session_name='session',
                      external_id='id', load_frequency=100)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    with pytest.raises(ValueError):
        S3FileSystem(access_key='access')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret',
                     role_arn='arn')
def test_s3_options():
    from pyarrow.fs import S3FileSystem

    fs = S3FileSystem(access_key='access', secret_key='secret',
                      session_token='token', region='us-east-2',
                      scheme='https', endpoint_override='localhost:8999')
    assert isinstance(fs, S3FileSystem)
    assert fs.region == 'us-east-2'
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(role_arn='role', session_name='session',
                      external_id='id', load_frequency=100)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    with pytest.raises(ValueError):
        S3FileSystem(access_key='access')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret',
                     role_arn='arn')
def test_s3_real_aws():
    # Exercise connection code with an AWS-backed S3 bucket.
    # This is a minimal integration check for ARROW-9261 and similar issues.
    from pyarrow.fs import FileSelector, S3FileSystem

    default_region = (os.environ.get('PYARROW_TEST_S3_REGION') or
                      'us-east-1')
    fs = S3FileSystem(anonymous=True)
    assert fs.region == default_region

    fs = S3FileSystem(anonymous=True, region='us-east-2')
    entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
    assert len(entries) > 0
    with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f:
        md = f.metadata()
        assert 'Content-Type' in md
        assert md['Last-Modified'] == b'2020-01-17T16:26:28Z'
        # For some reason, the header value is quoted
        # (both with AWS and Minio)
        assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"'
def get_s3_dataset(symbol: str, tick_type: str) -> FileSystemDataset:
    # Assumes module-level imports and config elsewhere in this file:
    # `dataset` and `FileSystemDataset` from pyarrow.dataset, plus the
    # B2_* credential/endpoint constants and the S3_PATH bucket prefix.
    from pyarrow.fs import S3FileSystem

    s3 = S3FileSystem(
        access_key=B2_ACCESS_KEY_ID,
        secret_key=B2_SECRET_ACCESS_KEY,
        endpoint_override=B2_ENDPOINT_URL,
    )
    ds = dataset(
        source=S3_PATH + f"/{tick_type}/symbol={symbol}/",
        format='feather',
        filesystem=s3,
        partitioning='hive',
        exclude_invalid_files=True,
    )
    return ds
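# Hedged usage sketch for get_s3_dataset above: the symbol/tick-type values
# and column names are made-up examples, not taken from the real schema, and
# the B2_* constants are assumed to be configured at module level.
def example_load_trades():
    ds = get_s3_dataset(symbol='AAPL', tick_type='trades')
    # Dataset.to_table() scans the matching Feather fragments into a single
    # in-memory Table; projecting columns limits how much is pulled from B2.
    return ds.to_table(columns=['symbol', 'price'])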
def main():
    parser = argparse.ArgumentParser(
        description="Generate sample parquet data")
    parser.add_argument('path', type=str, nargs='?',
                        help='path to save data to',
                        default="./data/data.parquet")
    parser.add_argument(
        '--source', type=str,
        help='local path to import data from '
             '(optional; can be csv, json or parquet)')
    parser.add_argument(
        '--endpoint', type=str,
        help='S3 endpoint (e.g.: '
             'https://s3.eu-de.cloud-object-storage.appdomain.cloud)')
    parser.add_argument('--access_key', type=str, help='S3 access key')
    parser.add_argument('--secret_key', type=str, help='S3 secret key')
    args = parser.parse_args()

    if args.endpoint:
        print("Using S3 file system")
        # S3FileSystem takes the host[:port] and scheme separately,
        # so split the endpoint URL.
        parsed_endpoint = urlparse(args.endpoint)
        fs = S3FileSystem(endpoint_override=parsed_endpoint.netloc,
                          scheme=parsed_endpoint.scheme,
                          access_key=args.access_key,
                          secret_key=args.secret_key,
                          background_writes=False)
    else:
        print("Using local file system")
        os.makedirs(os.path.dirname(args.path), exist_ok=True)
        fs = LocalFileSystem()

    table = import_table(args.source)
    with fs.open_output_stream(args.path) as f:
        pq.write_table(table, f)
    print("Table written to", args.path)
    print(table.to_pandas())
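# Hedged invocation sketch for the script above (the file name
# generate_data.py and all argument values are made up for illustration):
#
#   python generate_data.py my-bucket/sample/data.parquet \
#       --source ./input.csv \
#       --endpoint https://s3.eu-de.cloud-object-storage.appdomain.cloud \
#       --access_key <key-id> --secret_key <secret>
#
# Omitting --endpoint writes to the local default ./data/data.parquet via
# LocalFileSystem instead.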
def s3fs(request, s3_connection, s3_server):
    request.config.pyarrow.requires('s3')
    from pyarrow.fs import S3FileSystem

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'

    fs = S3FileSystem(access_key=access_key,
                      secret_key=secret_key,
                      endpoint_override='{}:{}'.format(host, port),
                      scheme='http')
    fs.create_dir(bucket)

    return dict(
        fs=fs,
        pathfn=bucket.__add__,
        allow_copy_file=True,
        allow_move_dir=False,
        allow_append_to_file=False,
    )
def s3fs(request, minio_server):
    request.config.pyarrow.requires('s3')
    # S3Options is the older pyarrow.fs configuration object; newer releases
    # take these options directly as S3FileSystem keyword arguments.
    from pyarrow.fs import S3Options, S3FileSystem

    address, access_key, secret_key = minio_server
    bucket = 'pyarrow-filesystem/'

    options = S3Options(endpoint_override=address, access_key=access_key,
                        secret_key=secret_key, scheme='http')
    fs = S3FileSystem(options)
    fs.create_dir(bucket)

    return dict(
        fs=fs,
        pathfn=bucket.__add__,
        allow_copy_file=True,
        allow_move_dir=False,
        allow_append_to_file=False,
    )
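# Hedged sketch of a test consuming either s3fs fixture dict above
# (test_fixture_create_dir is a made-up name, and FileType is per the
# current pyarrow.fs API; pathfn prefixes paths with the fixture's bucket):
def test_fixture_create_dir(s3fs):
    from pyarrow.fs import FileType

    fs = s3fs['fs']
    path = s3fs['pathfn']('some-dir/')
    fs.create_dir(path)
    # get_file_info on the created key should report a Directory entry
    infos = fs.get_file_info([path.rstrip('/')])
    assert infos[0].type == FileType.Directory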
def test_s3_proxy_options(monkeypatch):
    from pyarrow.fs import S3FileSystem

    # The following two are equivalent:
    proxy_opts_1_dict = {'scheme': 'http', 'host': 'localhost', 'port': 8999}
    proxy_opts_1_str = 'http://localhost:8999'
    # The following two are equivalent:
    proxy_opts_2_dict = {'scheme': 'https', 'host': 'localhost', 'port': 8080}
    proxy_opts_2_str = 'https://localhost:8080'

    # Check dict case for 'proxy_options'
    fs = S3FileSystem(proxy_options=proxy_opts_1_dict)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    # Check str case for 'proxy_options'
    fs = S3FileSystem(proxy_options=proxy_opts_1_str)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    # Check that two FSs using the same proxy_options dict are equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    # Check that two FSs using the same proxy_options str are equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    # Check that two FSs using equivalent proxy_options
    # (one dict, one str) are equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    # Check that two FSs using nonequivalent proxy_options are not equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    # Check that two FSs (one using proxy_options and the other not)
    # are not equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    # Only dict and str are supported
    with pytest.raises(TypeError):
        S3FileSystem(proxy_options=('http', 'localhost', 9090))
    # Missing scheme
    with pytest.raises(KeyError):
        S3FileSystem(proxy_options={'host': 'localhost', 'port': 9090})
    # Missing host
    with pytest.raises(KeyError):
        S3FileSystem(proxy_options={'scheme': 'https', 'port': 9090})
    # Missing port
    with pytest.raises(KeyError):
        S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost'})
    # Invalid proxy URI (invalid scheme htttps)
    with pytest.raises(pa.ArrowInvalid):
        S3FileSystem(proxy_options='htttps://localhost:9000')
    # Invalid proxy_options dict (invalid scheme htttp)
    with pytest.raises(pa.ArrowInvalid):
        S3FileSystem(proxy_options={'scheme': 'htttp', 'host': 'localhost',
                                    'port': 8999})
def main():
    args = parse_platform_metrics_calculator_pipeline_arguments(sys.argv[1:])
    time_range = _get_time_range(args.year, args.month)

    organisation_data = read_json_file(args.organisation_list_file)
    organisation_metadata = construct_organisation_list_from_dict(data=organisation_data)

    spine_messages = _read_spine_csv_gz_files(args.input_files)
    transfers = list(parse_transfers_from_messages(spine_messages, time_range))
    practice_metrics_data = calculate_practice_metrics_data(
        transfers, organisation_metadata.practices, time_range
    )
    national_metrics_data = calculate_national_metrics_data(
        transfers=transfers, time_range=time_range
    )
    organisation_metadata = construct_organisation_metadata(organisation_metadata)
    transfer_table = convert_transfers_to_table(transfers)

    practice_metrics_file_name = "practiceMetrics.json"
    organisation_metadata_file_name = "organisationMetadata.json"
    national_metrics_file_name = "nationalMetrics.json"
    transfers_file_name = "transfers.parquet"

    if _is_outputting_to_file(args):
        _write_data_platform_json_file(
            practice_metrics_data,
            f"{args.output_directory}/{args.month}-{args.year}-{practice_metrics_file_name}",
        )
        _write_data_platform_json_file(
            organisation_metadata,
            f"{args.output_directory}/{args.month}-{args.year}-{organisation_metadata_file_name}",
        )
        _write_data_platform_json_file(
            national_metrics_data,
            f"{args.output_directory}/{args.month}-{args.year}-{national_metrics_file_name}",
        )
        write_table(
            transfer_table,
            f"{args.output_directory}/{args.month}-{args.year}-{transfers_file_name}",
        )
    elif _is_outputting_to_s3(args):
        s3 = boto3.resource("s3", endpoint_url=args.s3_endpoint_url)
        bucket_name = args.output_bucket
        version = "v2"
        s3_path = f"{version}/{args.year}/{args.month}"
        _upload_data_platform_json_object(
            practice_metrics_data,
            s3.Object(bucket_name, f"{s3_path}/{practice_metrics_file_name}"),
        )
        _upload_data_platform_json_object(
            organisation_metadata,
            s3.Object(bucket_name, f"{s3_path}/{organisation_metadata_file_name}"),
        )
        _upload_data_platform_json_object(
            national_metrics_data,
            s3.Object(bucket_name, f"{s3_path}/{national_metrics_file_name}"),
        )
        write_table(
            table=transfer_table,
            where=f"{bucket_name}/{s3_path}/{transfers_file_name}",
            filesystem=S3FileSystem(endpoint_override=args.s3_endpoint_url),
        )
def test_s3_options():
    from pyarrow.fs import S3FileSystem

    fs = S3FileSystem(access_key='access', secret_key='secret',
                      session_token='token', region='us-east-2',
                      scheme='https', endpoint_override='localhost:8999')
    assert isinstance(fs, S3FileSystem)
    assert fs.region == 'us-east-2'
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(role_arn='role', session_name='session',
                      external_id='id', load_frequency=100)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(anonymous=True)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(background_writes=True,
                      default_metadata={"ACL": "authenticated-read",
                                        "Content-Type": "text/plain"})
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    with pytest.raises(ValueError):
        S3FileSystem(access_key='access')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret',
                     role_arn='arn')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret',
                     anonymous=True)
    with pytest.raises(ValueError):
        S3FileSystem(role_arn="arn", anonymous=True)
    with pytest.raises(ValueError):
        S3FileSystem(default_metadata=["foo", "bar"])