Example #1
    def __s3_file_system(self):
        from pyarrow import fs

        connection = self.connection
        if "role_arn" in connection._kwargs and connection._kwargs["role_arn"]:
            # Assume an IAM role; the temporary credentials are refreshed
            # every ``duration_seconds``.
            external_id = connection._kwargs.get("external_id", None)
            s3_fs = fs.S3FileSystem(
                role_arn=connection._kwargs["role_arn"],
                session_name=connection._kwargs["role_session_name"],
                external_id="" if external_id is None else external_id,
                load_frequency=connection._kwargs["duration_seconds"],
                region=connection.region_name,
            )
        elif connection.profile_name:
            # Use static credentials from the named profile in the local
            # AWS config.
            profile = connection.session._session.full_config["profiles"][
                connection.profile_name]
            s3_fs = fs.S3FileSystem(
                access_key=profile.get("aws_access_key_id", None),
                secret_key=profile.get("aws_secret_access_key", None),
                session_token=profile.get("aws_session_token", None),
                region=connection.region_name,
            )
        else:
            # Fall back to credentials passed directly to the connection.
            s3_fs = fs.S3FileSystem(
                access_key=connection._kwargs.get("aws_access_key_id", None),
                secret_key=connection._kwargs.get("aws_secret_access_key",
                                                  None),
                session_token=connection._kwargs.get("aws_session_token",
                                                     None),
                region=connection.region_name,
            )
        return s3_fs
Example #2
def main2():
    # By default, MinIO will listen for unencrypted HTTP traffic.
    minio = fs.S3FileSystem(scheme="http", endpoint_override="10.0.0.2:9000")

    # List all contents in a bucket, recursively
    file_selector = fs.FileSelector('customer-data-text', recursive=True)
    print_file_info(minio, file_selector)

    print(read_pafs_file(minio, 'customer-data-text/customer.csv'))
    print(read_pafs_stream(minio, 'customer-data-text/customer.csv'))

    endpoint_url = 'http://10.0.0.2:9000'
    print_boto3_buckets(endpoint_url)

    # TODO: read multiple files using dataset

    # https://stackoverflow.com/questions/45082832/how-to-read-partitioned-parquet-files-from-s3-using-pyarrow-in-python
    file_system = get_s3fs()
    print(file_system.ls('example-data'))

    bucket_uri = 's3://example-data/external-data'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)

    bucket_uri = 's3://example-data/external-clustered'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)
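The "read multiple files using dataset" TODO in main2 above (and the linked Stack Overflow question about partitioned Parquet) can be handled with pyarrow.dataset against the same filesystem object. Below is a minimal sketch, assuming the customer-data-text bucket holds only CSV files that share a schema:

import pyarrow.dataset as ds
from pyarrow import fs


def read_bucket_as_dataset():
    # Same MinIO endpoint as main2 above.
    minio = fs.S3FileSystem(scheme="http", endpoint_override="10.0.0.2:9000")
    # Treat every CSV file under the bucket prefix as one logical dataset.
    dataset = ds.dataset("customer-data-text", format="csv", filesystem=minio)
    table = dataset.to_table()
    print(table.num_rows, table.schema)

The same call covers the partitioned Parquet case from the Stack Overflow link by switching format to "parquet" and passing partitioning="hive".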
Example #3
 def scan_file(self, bucket, key, schema):
     logging.info(f"delim is {self.delimiter}")
     uri = f"{bucket}/{key}"
     s3fs = fs.S3FileSystem()
     # Run column order validation by opening and not reading anything.
     filestream = s3fs.open_input_stream(uri)
     parse_opts = csv.ParseOptions(delimiter=self.delimiter)
     reader = csv.open_csv(filestream, parse_options=parse_opts)
     for index, col in enumerate(reader.schema):
         if col.name != schema[index].name:
             msg = "column {} is out of order".format(col.name)
             raise ColumnOrderException(msg)
     # Run the rest of the validations.
     filestream = s3fs.open_input_stream(uri)
     opts = csv.ConvertOptions(column_types=schema)
     reader = csv.open_csv(filestream,
                           convert_options=opts,
                           parse_options=parse_opts)
     # Kind of a hack, but it works: if the delimiter is wrong, everything
     # is read as one column.
     if len(schema) > 1 and len(reader.schema) == 1:
         raise WrongDelimiterException()
     # Parse through the file; pyarrow will throw exceptions
     # if there's invalid data.
     for batch in reader:
         # If primary key is a string, need to check the column
         # for empty strings.
         if schema.field(self.primary_key).type == "string":
             table = pyarrow.Table.from_batches([batch])
             for val in table[self.primary_key]:
                 if val.as_py() == "":
                     raise EmptyPrimaryKeyException()
Example #4
def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
    # Write to buffer
    fname = "test_csv_reader_arrow_nativefile.csv"
    bname = "csv"
    buffer = pdf.to_csv(index=False)
    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"], )
        with fs.open_input_file(f"{bname}/{fname}") as fil:
            got = cudf.read_csv(fil)

    assert_eq(pdf, got)
Example #5
def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns):
    # Write to buffer
    fname = "test_parquet_reader_arrow_nativefile.parquet"
    bname = "parquet"
    buffer = BytesIO()
    pdf.to_parquet(path=buffer)
    buffer.seek(0)
    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"], )
        with fs.open_input_file(f"{bname}/{fname}") as fil:
            got = cudf.read_parquet(fil, columns=columns)

    expect = pdf[columns] if columns else pdf
    assert_eq(expect, got)
Example #6
def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns):
    source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc")
    fname = "test_orc_reader.orc"
    bname = "orc"
    expect = pa.orc.ORCFile(source_file).read().to_pandas()

    with open(source_file, "rb") as f:
        buffer = f.read()

    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"], )
        with fs.open_input_file(f"{bname}/{fname}") as fil:
            got = cudf.read_orc(fil, columns=columns)

    if columns:
        expect = expect[columns]
    assert_eq(expect, got)
Example #7
File: test_s3.py  Project: rongou/cudf
def test_read_parquet(s3_base, s3so, open_file_options):
    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
    buffer = BytesIO()
    pdf.to_parquet(path=buffer)
    buffer.seek(0)
    with s3_context(s3_base=s3_base,
                    bucket="daskparquet",
                    files={"file.parq": buffer}):
        if "open_file_func" in open_file_options:
            fs = pa_fs.S3FileSystem(
                endpoint_override=s3so["client_kwargs"]["endpoint_url"], )
            open_file_options["open_file_func"] = fs.open_input_file
        df = dask_cudf.read_parquet(
            "s3://daskparquet/*.parq",
            storage_options=s3so,
            open_file_options=open_file_options,
        )
        assert df.a.sum().compute() == 10
        assert df.b.sum().compute() == 9
Example #8
File: writer.py  Project: Mu-L/airbyte
 def __init__(self, connection: Connection, s3_bucket: str, access_key: str,
              secret_key: str, s3_region: str) -> None:
     """
     :param connection: Firebolt SDK connection class with an established
         connection to the database.
     :param s3_bucket: Intermediate bucket to store the data files before writing them to Firebolt.
         Has to be created and accessible.
     :param access_key: AWS Access Key ID that has read/write/delete permissions on the files in the bucket.
     :param secret_key: Corresponding AWS Secret Key.
     :param s3_region: S3 region. Best to keep this the same as the Firebolt
         database region. Defaults to us-east-1.
     """
     super().__init__(connection)
     self.key_id = access_key
     self.secret_key = secret_key
     self.s3_bucket = s3_bucket
     self._updated_tables = set()
     self.unique_dir = f"{int(time())}_{uuid4()}"
     self.fs = fs.S3FileSystem(access_key=access_key,
                               secret_key=secret_key,
                               region=s3_region)
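The constructor above only wires up credentials, the staging bucket, and a unique per-run prefix. As a generic illustration (not the Airbyte writer's actual method), a table could be staged into that intermediate bucket through the fs.S3FileSystem it builds roughly as follows; the function name and path layout are assumptions:

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import fs


def stage_table(s3_fs: fs.S3FileSystem, bucket: str, unique_dir: str,
                table: pa.Table, stream_name: str) -> str:
    # Generic staging sketch, not the Airbyte implementation: write a single
    # Parquet file under the unique per-run prefix in the intermediate bucket.
    path = f"{bucket}/{unique_dir}/{stream_name}.parquet"
    with s3_fs.open_output_stream(path) as f:
        pq.write_table(table, f)
    return path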
Example #9
def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):

    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)
    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
Example #10
def s3_file_stream(s3_bucket, s3_file_path, s3_region):
    s3_client = fs.S3FileSystem(region=s3_region, anonymous=True)
    return s3_client.open_input_stream("{}/{}".format(s3_bucket, s3_file_path))
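A usage sketch for the helper above; the bucket, key, and region are hypothetical placeholders, and anonymous=True means this only works against objects that allow unauthenticated reads:

from pyarrow import csv

# Hypothetical public object; bucket, key and region below are placeholders.
stream = s3_file_stream("some-public-bucket", "data/example.csv", "us-east-1")
table = csv.read_csv(stream)
print(table.num_rows)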
Example #11
 def read_parquet_with_pyarrow():
     # https://issues.apache.org/jira/browse/ARROW-8832
     # It does not work :(
     s3 = fs.S3FileSystem(region='eu-west-3')
Example #12
from os import environ
import pandas as pd
import pyarrow as pa
from pyarrow import fs
import pyarrow.dataset as ds


s3 = fs.S3FileSystem(
    access_key=environ['B2_ACCESS_KEY_ID'],
    secret_key=environ['B2_SECRET_ACCESS_KEY'],
    endpoint_override=environ['B2_ENDPOINT_URL']
)

dataset = ds.dataset(
    source='polygon-equities/data/trades',
    format='feather',
    filesystem=s3,
    partitioning='hive',
    exclude_invalid_files=True
)

df = dataset.to_table(
    # columns=['symbol', 'sip_epoch', 'price', 'size'],
    filter=ds.field('date') == '2020-07-01'
).to_pandas()


# local
dataset = ds.dataset(
    source='/Users/bobcolner/QuantClarity/data/trades/feather/',
    format='feather',
)
Example #13
 def setUpClass(cls):
     # Prevents slow requests to the EC2 instance metadata endpoint
     # (169.254.169.254).
     os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
     cls.s3 = fs.S3FileSystem(endpoint_override="127.0.0.1:3000",
                              scheme="http",
                              anonymous=True)
Example #14
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the Arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these as dates, but
    # will run the validation against strings for columns expecting values to
    # match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df