def to_s3(self, dataframe, path, file_format, database=None, table=None, partition_cols=None,
          preserve_index=True, mode="append", procs_cpu_bound=None, procs_io_bound=None, cast_columns=None):
    """
    Write a Pandas Dataframe on S3.
    Optionally writes metadata on AWS Glue.

    :param dataframe: Pandas Dataframe
    :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
    :param file_format: "csv" or "parquet"
    :param database: AWS Glue Database name
    :param table: AWS Glue table name
    :param partition_cols: List of column names that will be partitions on S3
    :param preserve_index: Should the Dataframe index be preserved on S3?
    :param mode: "append", "overwrite" or "overwrite_partitions"
    :param procs_cpu_bound: Number of cores used for CPU bound tasks
    :param procs_io_bound: Number of cores used for I/O bound tasks
    :param cast_columns: Dictionary of column indexes and Arrow types to be cast
                         (E.g. {2: "int64", 5: "int32"}) (Only for the "parquet" file_format)
    :return: List of objects written on S3
    """
    if dataframe.empty:
        raise EmptyDataframe()
    if not partition_cols:
        partition_cols = []
    if mode == "overwrite" or (mode == "overwrite_partitions" and not partition_cols):
        self._session.s3.delete_objects(path=path)
    elif mode not in ["overwrite_partitions", "append"]:
        raise UnsupportedWriteMode(mode)
    objects_paths = self.data_to_s3(dataframe=dataframe,
                                    path=path,
                                    partition_cols=partition_cols,
                                    preserve_index=preserve_index,
                                    file_format=file_format,
                                    mode=mode,
                                    procs_cpu_bound=procs_cpu_bound,
                                    procs_io_bound=procs_io_bound,
                                    cast_columns=cast_columns)
    if database:
        self._session.glue.metadata_to_glue(dataframe=dataframe,
                                            path=path,
                                            objects_paths=objects_paths,
                                            database=database,
                                            table=table,
                                            partition_cols=partition_cols,
                                            preserve_index=preserve_index,
                                            file_format=file_format,
                                            mode=mode,
                                            cast_columns=cast_columns)
    return objects_paths
def to_s3(self, dataframe, path, file_format, database=None, table=None, partition_cols=None,
          preserve_index=True, mode="append", compression=None, procs_cpu_bound=None, procs_io_bound=None,
          cast_columns=None, extra_args=None, inplace=True):
    """
    Write a Pandas Dataframe on S3.
    Optionally writes metadata on AWS Glue.

    :param dataframe: Pandas Dataframe
    :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
    :param file_format: "csv" or "parquet"
    :param database: AWS Glue Database name
    :param table: AWS Glue table name
    :param partition_cols: List of column names that will be partitions on S3
    :param preserve_index: Should the Dataframe index be preserved on S3?
    :param mode: "append", "overwrite" or "overwrite_partitions"
    :param compression: None, gzip, snappy, etc.
    :param procs_cpu_bound: Number of cores used for CPU bound tasks
    :param procs_io_bound: Number of cores used for I/O bound tasks
    :param cast_columns: Dictionary of column names and Athena/Glue types to be cast
                         (E.g. {"col name": "bigint", "col2 name": "int"}) (Only for the "parquet" file_format)
    :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
    :param inplace: True is cheapest (CPU and Memory) but False leaves your DataFrame intact
    :return: List of objects written on S3
    """
    if not partition_cols:
        partition_cols = []
    if not cast_columns:
        cast_columns = {}
    dataframe = Pandas.normalize_columns_names_athena(dataframe, inplace=inplace)
    cast_columns = {Athena.normalize_column_name(k): v for k, v in cast_columns.items()}
    logger.debug(f"cast_columns: {cast_columns}")
    partition_cols = [Athena.normalize_column_name(x) for x in partition_cols]
    logger.debug(f"partition_cols: {partition_cols}")
    dataframe = Pandas.drop_duplicated_columns(dataframe=dataframe, inplace=inplace)
    if compression is not None:
        compression = compression.lower()
    file_format = file_format.lower()
    if file_format == "csv":
        if compression not in Pandas.VALID_CSV_COMPRESSIONS:
            raise InvalidCompression(
                f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}")
    elif file_format == "parquet":
        if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
            raise InvalidCompression(
                f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}")
    else:
        raise UnsupportedFileFormat(file_format)
    if dataframe.empty:
        raise EmptyDataframe()
    if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)):
        self._session.s3.delete_objects(path=path)
    elif mode not in ["overwrite_partitions", "append"]:
        raise UnsupportedWriteMode(mode)
    objects_paths = self.data_to_s3(dataframe=dataframe,
                                    path=path,
                                    partition_cols=partition_cols,
                                    preserve_index=preserve_index,
                                    file_format=file_format,
                                    mode=mode,
                                    compression=compression,
                                    procs_cpu_bound=procs_cpu_bound,
                                    procs_io_bound=procs_io_bound,
                                    cast_columns=cast_columns,
                                    extra_args=extra_args)
    if database:
        self._session.glue.metadata_to_glue(dataframe=dataframe,
                                            path=path,
                                            objects_paths=objects_paths,
                                            database=database,
                                            table=table,
                                            partition_cols=partition_cols,
                                            preserve_index=preserve_index,
                                            file_format=file_format,
                                            mode=mode,
                                            compression=compression,
                                            cast_columns=cast_columns,
                                            extra_args=extra_args)
    return objects_paths
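# --- Usage sketch (not part of the method above) ---
# A minimal, hypothetical example of calling the variant above, which takes
# cast_columns keyed by column name with Athena/Glue types and an `inplace` flag.
# The entry point (an awswrangler-style Session exposing this wrapper as
# `session.pandas`), the bucket, database and table names are all assumptions
# chosen for illustration and may differ between library versions.
import pandas as pd
import awswrangler

df = pd.DataFrame({
    "user id": [1, 2, 3],               # note the space: column names get normalized for Athena
    "value": [10.5, 20.1, 30.7],
    "event_date": ["2019-11-09", "2019-11-09", "2019-11-10"],
})

session = awswrangler.Session()          # assumed entry point
paths = session.pandas.to_s3(
    dataframe=df,
    path="s3://example-bucket/events/",  # placeholder bucket/prefix
    file_format="parquet",
    database="example_db",               # optional: also registers the table on Glue
    table="events",
    partition_cols=["event_date"],
    preserve_index=False,
    mode="overwrite_partitions",
    compression="snappy",
    cast_columns={"user id": "bigint"},  # Athena/Glue type, keyed by (pre-normalization) column name
    inplace=False,                       # keep the caller's DataFrame untouched
)
print(paths)                             # list of S3 objects written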
def to_s3(self, dataframe, path, file_format, database=None, table=None, partition_cols=None,
          preserve_index=True, mode="append", compression=None, procs_cpu_bound=None, procs_io_bound=None,
          cast_columns=None, extra_args=None):
    """
    Write a Pandas Dataframe on S3.
    Optionally writes metadata on AWS Glue.

    :param dataframe: Pandas Dataframe
    :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
    :param file_format: "csv" or "parquet"
    :param database: AWS Glue Database name
    :param table: AWS Glue table name
    :param partition_cols: List of column names that will be partitions on S3
    :param preserve_index: Should the Dataframe index be preserved on S3?
    :param mode: "append", "overwrite" or "overwrite_partitions"
    :param compression: None, gzip, snappy, etc.
    :param procs_cpu_bound: Number of cores used for CPU bound tasks
    :param procs_io_bound: Number of cores used for I/O bound tasks
    :param cast_columns: Dictionary of column indexes and Arrow types to be cast
                         (E.g. {2: "int64", 5: "int32"}) (Only for the "parquet" file_format)
    :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
    :return: List of objects written on S3
    """
    if compression is not None:
        compression = compression.lower()
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    if file_format == "csv":
        if compression not in Pandas.VALID_CSV_COMPRESSIONS:
            raise InvalidCompression(
                f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}")
    elif file_format == "parquet":
        if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
            raise InvalidCompression(
                f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}")
    if dataframe.empty:
        raise EmptyDataframe()
    if not partition_cols:
        partition_cols = []
    if mode == "overwrite" or (mode == "overwrite_partitions" and not partition_cols):
        self._session.s3.delete_objects(path=path)
    elif mode not in ["overwrite_partitions", "append"]:
        raise UnsupportedWriteMode(mode)
    objects_paths = self.data_to_s3(dataframe=dataframe,
                                    path=path,
                                    partition_cols=partition_cols,
                                    preserve_index=preserve_index,
                                    file_format=file_format,
                                    mode=mode,
                                    compression=compression,
                                    procs_cpu_bound=procs_cpu_bound,
                                    procs_io_bound=procs_io_bound,
                                    cast_columns=cast_columns,
                                    extra_args=extra_args)
    if database:
        self._session.glue.metadata_to_glue(dataframe=dataframe,
                                            path=path,
                                            objects_paths=objects_paths,
                                            database=database,
                                            table=table,
                                            partition_cols=partition_cols,
                                            preserve_index=preserve_index,
                                            file_format=file_format,
                                            mode=mode,
                                            compression=compression,
                                            cast_columns=cast_columns,
                                            extra_args=extra_args)
    return objects_paths
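# --- Usage sketch (not part of the method above) ---
# A hypothetical CSV call against the variant above, which accepts per-format
# `extra_args` (e.g. "sep") and a compression codec. Whether "gzip" is accepted
# depends on Pandas.VALID_CSV_COMPRESSIONS; the Session entry point and all
# names below are placeholders, not confirmed API.
import pandas as pd
import awswrangler

df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

session = awswrangler.Session()                   # assumed entry point
paths = session.pandas.to_s3(
    dataframe=df,
    path="s3://example-bucket/csv-export/",       # placeholder bucket/prefix
    file_format="csv",
    preserve_index=False,
    mode="append",
    compression="gzip",                           # must be listed in VALID_CSV_COMPRESSIONS
    extra_args={"sep": ";"},                      # format-specific argument from the docstring
)
print(paths)                                      # list of S3 objects written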