def test_is_valid(self):
    self.assertTrue(AthenaCompression.is_valid('snappy'))
    self.assertFalse(AthenaCompression.is_valid(None))
    self.assertFalse(AthenaCompression.is_valid(''))
    self.assertFalse(AthenaCompression.is_valid('foobar'))
def to_sql(df, name, conn, location, schema='default', index=False,
           index_label=None, chunksize=None, if_exists='fail',
           compression=None, flavor='spark',
           type_mappings=to_sql_type_mappings):
    # TODO Supports orc, avro, json, csv or tsv format
    # TODO Supports partitioning
    if if_exists not in ('fail', 'replace', 'append'):
        raise ValueError('`{0}` is not valid for if_exists'.format(if_exists))
    if compression is not None and not AthenaCompression.is_valid(compression):
        raise ValueError(
            '`{0}` is not valid for compression'.format(compression))

    import pyarrow as pa
    import pyarrow.parquet as pq

    bucket_name, key_prefix = parse_output_location(location)
    bucket = conn.session.resource('s3', region_name=conn.region_name,
                                   **conn._client_kwargs).Bucket(bucket_name)
    cursor = conn.cursor()
    retry_config = conn.retry_config

    # Check whether the table already exists in the target schema.
    table = cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = '{schema}'
    AND table_name = '{table}'
    """.format(schema=schema, table=name)).fetchall()
    if if_exists == 'fail':
        if table:
            raise OperationalError('Table `{0}.{1}` already exists.'.format(
                schema, name))
    elif if_exists == 'replace':
        if table:
            # Drop the existing table and delete its objects under the prefix.
            cursor.execute("""
            DROP TABLE {schema}.{table}
            """.format(schema=schema, table=name))
            objects = bucket.objects.filter(Prefix=key_prefix)
            if list(objects.limit(1)):
                objects.delete()

    if index:
        reset_index(df, index_label)
    # Write each chunk of the DataFrame to S3 as a Parquet object.
    for chunk in get_chunks(df, chunksize):
        table = pa.Table.from_pandas(chunk)
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf,
                       compression=compression,
                       flavor=flavor)
        retry_api_call(bucket.put_object,
                       config=retry_config,
                       Body=buf.getvalue().to_pybytes(),
                       Key=key_prefix + str(uuid.uuid4()))
    # Register the uploaded data as an external table.
    ddl = generate_ddl(df=df,
                       name=name,
                       location=location,
                       schema=schema,
                       compression=compression,
                       type_mappings=type_mappings)
    cursor.execute(ddl)
def test_is_valid(self): self.assertTrue(AthenaCompression.is_valid("snappy")) self.assertFalse(AthenaCompression.is_valid(None)) self.assertFalse(AthenaCompression.is_valid("")) self.assertFalse(AthenaCompression.is_valid("foobar"))
def to_sql(
    df,
    name,
    conn,
    location,
    schema="default",
    index=False,
    index_label=None,
    partitions=None,
    chunksize=None,
    if_exists="fail",
    compression=None,
    flavor="spark",
    type_mappings=to_sql_type_mappings,
    executor_class=ThreadPoolExecutor,
    max_workers=(cpu_count() or 1) * 5,
):
    # TODO Supports orc, avro, json, csv or tsv format
    if if_exists not in ("fail", "replace", "append"):
        raise ValueError("`{0}` is not valid for if_exists".format(if_exists))
    if compression is not None and not AthenaCompression.is_valid(compression):
        raise ValueError("`{0}` is not valid for compression".format(compression))
    if partitions is None:
        partitions = []

    bucket_name, key_prefix = parse_output_location(location)
    bucket = conn.session.resource(
        "s3", region_name=conn.region_name, **conn._client_kwargs
    ).Bucket(bucket_name)
    cursor = conn.cursor()

    # Check whether the table already exists in the target schema.
    table = cursor.execute(
        """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = '{schema}' AND table_name = '{table}'
        """.format(
            schema=schema, table=name
        )
    ).fetchall()
    if if_exists == "fail":
        if table:
            raise OperationalError(
                "Table `{0}.{1}` already exists.".format(schema, name)
            )
    elif if_exists == "replace":
        if table:
            # Drop the existing table and delete its objects under the prefix.
            cursor.execute(
                """
                DROP TABLE {schema}.{table}
                """.format(
                    schema=schema, table=name
                )
            )
            objects = bucket.objects.filter(Prefix=key_prefix)
            if list(objects.limit(1)):
                objects.delete()

    if index:
        reset_index(df, index_label)
    # Upload Parquet chunks concurrently, one task per chunk.
    with executor_class(max_workers=max_workers) as e:
        futures = []
        session_kwargs = deepcopy(conn._session_kwargs)
        session_kwargs.update({"profile_name": conn.profile_name})
        client_kwargs = deepcopy(conn._client_kwargs)
        client_kwargs.update({"region_name": conn.region_name})
        if partitions:
            # Group rows by partition columns and write each group under
            # a Hive-style key prefix (e.g. `col=value/`).
            for keys, group in df.groupby(by=partitions, observed=True):
                keys = keys if isinstance(keys, tuple) else (keys,)
                group = group.drop(partitions, axis=1)
                partition_prefix = "/".join(
                    ["{0}={1}".format(key, val) for key, val in zip(partitions, keys)]
                )
                for chunk in get_chunks(group, chunksize):
                    futures.append(
                        e.submit(
                            to_parquet,
                            chunk,
                            bucket_name,
                            "{0}{1}/".format(key_prefix, partition_prefix),
                            conn._retry_config,
                            session_kwargs,
                            client_kwargs,
                            compression,
                            flavor,
                        )
                    )
        else:
            for chunk in get_chunks(df, chunksize):
                futures.append(
                    e.submit(
                        to_parquet,
                        chunk,
                        bucket_name,
                        key_prefix,
                        conn._retry_config,
                        session_kwargs,
                        client_kwargs,
                        compression,
                        flavor,
                    )
                )
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            _logger.info("to_parquet: {0}".format(result))

    # Register the uploaded data as an external table.
    ddl = generate_ddl(
        df=df,
        name=name,
        location=location,
        schema=schema,
        partitions=partitions,
        compression=compression,
        type_mappings=type_mappings,
    )
    _logger.info(ddl)
    cursor.execute(ddl)
    if partitions:
        # Discover the newly written partitions.
        repair = "MSCK REPAIR TABLE {0}.{1}".format(schema, name)
        _logger.info(repair)
        cursor.execute(repair)
def test_is_valid(self):
    assert AthenaCompression.is_valid("snappy")
    assert AthenaCompression.is_valid("SNAPPY")
    assert not AthenaCompression.is_valid("")
    assert not AthenaCompression.is_valid("foobar")
def to_sql( df: "DataFrame", name: str, conn: "Connection", location: str, schema: str = "default", index: bool = False, index_label: Optional[str] = None, partitions: List[str] = None, chunksize: Optional[int] = None, if_exists: str = "fail", compression: str = None, flavor: str = "spark", type_mappings: Callable[["Series"], str] = to_sql_type_mappings, executor_class: Type[Union[ThreadPoolExecutor, ProcessPoolExecutor]] = ThreadPoolExecutor, max_workers: int = (cpu_count() or 1) * 5, repair_table=True, ) -> None: # TODO Supports orc, avro, json, csv or tsv format if if_exists not in ("fail", "replace", "append"): raise ValueError(f"`{if_exists}` is not valid for if_exists") if compression is not None and not AthenaCompression.is_valid(compression): raise ValueError(f"`{compression}` is not valid for compression") if partitions is None: partitions = [] if not location.endswith("/"): location += "/" bucket_name, key_prefix = parse_output_location(location) bucket = conn.session.resource("s3", region_name=conn.region_name, **conn._client_kwargs).Bucket(bucket_name) cursor = conn.cursor() table = cursor.execute( textwrap.dedent(f""" SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema}' AND table_name = '{name}' """)).fetchall() if if_exists == "fail": if table: raise OperationalError(f"Table `{schema}.{name}` already exists.") elif if_exists == "replace": if table: cursor.execute( textwrap.dedent(f""" DROP TABLE {schema}.{name} """)) objects = bucket.objects.filter(Prefix=key_prefix) if list(objects.limit(1)): objects.delete() if index: reset_index(df, index_label) with executor_class(max_workers=max_workers) as e: futures = [] session_kwargs = deepcopy(conn._session_kwargs) session_kwargs.update({"profile_name": conn.profile_name}) client_kwargs = deepcopy(conn._client_kwargs) client_kwargs.update({"region_name": conn.region_name}) partition_prefixes = [] if partitions: for keys, group in df.groupby(by=partitions, observed=True): keys = keys if isinstance(keys, tuple) else (keys, ) group = group.drop(partitions, axis=1) partition_prefix = "/".join( [f"{key}={val}" for key, val in zip(partitions, keys)]) partition_prefixes.append(( ", ".join([ f"`{key}` = '{val}'" for key, val in zip(partitions, keys) ]), f"{location}{partition_prefix}/", )) for chunk in get_chunks(group, chunksize): futures.append( e.submit( to_parquet, chunk, bucket_name, f"{key_prefix}{partition_prefix}/", conn._retry_config, session_kwargs, client_kwargs, compression, flavor, )) else: for chunk in get_chunks(df, chunksize): futures.append( e.submit( to_parquet, chunk, bucket_name, key_prefix, conn._retry_config, session_kwargs, client_kwargs, compression, flavor, )) for future in concurrent.futures.as_completed(futures): result = future.result() _logger.info(f"to_parquet: {result}") ddl = generate_ddl( df=df, name=name, location=location, schema=schema, partitions=partitions, compression=compression, type_mappings=type_mappings, ) _logger.info(ddl) cursor.execute(ddl) if partitions and repair_table: for partition in partition_prefixes: add_partition = textwrap.dedent(f""" ALTER TABLE `{schema}`.`{name}` ADD IF NOT EXISTS PARTITION ({partition[0]}) LOCATION '{partition[1]}' """) _logger.info(add_partition) cursor.execute(add_partition)