Example #1
    def to_s3(self,
              dataframe,
              path,
              file_format,
              database=None,
              table=None,
              partition_cols=None,
              preserve_index=True,
              mode="append",
              procs_cpu_bound=None,
              procs_io_bound=None,
              cast_columns=None):
        """
        Write a Pandas DataFrame to S3.
        Optionally writes metadata to AWS Glue.

        :param dataframe: Pandas DataFrame
        :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
        :param file_format: "csv" or "parquet"
        :param database: AWS Glue Database name
        :param table: AWS Glue table name
        :param partition_cols: List of column names that will be partitions on S3
        :param preserve_index: Should the DataFrame index be preserved on S3?
        :param mode: "append", "overwrite", "overwrite_partitions"
        :param procs_cpu_bound: Number of cores used for CPU bound tasks
        :param procs_io_bound: Number of cores used for I/O bound tasks
        :param cast_columns: Dictionary of column indexes and Arrow types to be cast. (E.g. {2: "int64", 5: "int32"}) (Only for "parquet" file_format)
        :return: List of objects written on S3
        """
        if dataframe.empty:
            raise EmptyDataframe()
        if not partition_cols:
            partition_cols = []
        if mode == "overwrite" or (mode == "overwrite_partitions"
                                   and not partition_cols):
            self._session.s3.delete_objects(path=path)
        elif mode not in ["overwrite_partitions", "append"]:
            raise UnsupportedWriteMode(mode)
        objects_paths = self.data_to_s3(dataframe=dataframe,
                                        path=path,
                                        partition_cols=partition_cols,
                                        preserve_index=preserve_index,
                                        file_format=file_format,
                                        mode=mode,
                                        procs_cpu_bound=procs_cpu_bound,
                                        procs_io_bound=procs_io_bound,
                                        cast_columns=cast_columns)
        if database:
            self._session.glue.metadata_to_glue(dataframe=dataframe,
                                                path=path,
                                                objects_paths=objects_paths,
                                                database=database,
                                                table=table,
                                                partition_cols=partition_cols,
                                                preserve_index=preserve_index,
                                                file_format=file_format,
                                                mode=mode,
                                                cast_columns=cast_columns)
        return objects_paths
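A minimal usage sketch for this variant, assuming the older awswrangler Session API in which the method is reachable as session.pandas.to_s3; the bucket, database, and table names below are placeholders.

    import pandas as pd
    import awswrangler

    # Placeholder session and data; bucket/database/table names are illustrative only.
    session = awswrangler.Session()
    df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # Write the DataFrame as Parquet partitioned by "value" and register the
    # table in the Glue catalog; the return value is the list of written objects.
    written = session.pandas.to_s3(dataframe=df,
                                   path="s3://my-bucket/my-folder/",
                                   file_format="parquet",
                                   database="my_database",
                                   table="my_table",
                                   partition_cols=["value"],
                                   preserve_index=False,
                                   mode="overwrite")
    print(written)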
Example #2
    def to_s3(self,
              dataframe,
              path,
              file_format,
              database=None,
              table=None,
              partition_cols=None,
              preserve_index=True,
              mode="append",
              compression=None,
              procs_cpu_bound=None,
              procs_io_bound=None,
              cast_columns=None,
              extra_args=None,
              inplace=True):
        """
        Write a Pandas DataFrame to S3.
        Optionally writes metadata to AWS Glue.

        :param dataframe: Pandas DataFrame
        :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
        :param file_format: "csv" or "parquet"
        :param database: AWS Glue Database name
        :param table: AWS Glue table name
        :param partition_cols: List of column names that will be partitions on S3
        :param preserve_index: Should the DataFrame index be preserved on S3?
        :param mode: "append", "overwrite", "overwrite_partitions"
        :param compression: None, gzip, snappy, etc.
        :param procs_cpu_bound: Number of cores used for CPU bound tasks
        :param procs_io_bound: Number of cores used for I/O bound tasks
        :param cast_columns: Dictionary of column names and Athena/Glue types to be cast. (E.g. {"col name": "bigint", "col2 name": "int"}) (Only for "parquet" file_format)
        :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
        :param inplace: True is cheapest (CPU and memory), but False leaves your DataFrame intact
        :return: List of objects written on S3
        """
        if not partition_cols:
            partition_cols = []
        if not cast_columns:
            cast_columns = {}
        dataframe = Pandas.normalize_columns_names_athena(dataframe, inplace=inplace)
        cast_columns = {Athena.normalize_column_name(k): v for k, v in cast_columns.items()}
        logger.debug(f"cast_columns: {cast_columns}")
        partition_cols = [Athena.normalize_column_name(x) for x in partition_cols]
        logger.debug(f"partition_cols: {partition_cols}")
        dataframe = Pandas.drop_duplicated_columns(dataframe=dataframe, inplace=inplace)
        if compression is not None:
            compression = compression.lower()
        file_format = file_format.lower()
        if file_format == "csv":
            if compression not in Pandas.VALID_CSV_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}")
        elif file_format == "parquet":
            if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}")
        else:
            raise UnsupportedFileFormat(file_format)
        if dataframe.empty:
            raise EmptyDataframe()
        if ((mode == "overwrite") or ((mode == "overwrite_partitions") and  # noqa
                                      (not partition_cols))):
            self._session.s3.delete_objects(path=path)
        elif mode not in ["overwrite_partitions", "append"]:
            raise UnsupportedWriteMode(mode)
        objects_paths = self.data_to_s3(dataframe=dataframe,
                                        path=path,
                                        partition_cols=partition_cols,
                                        preserve_index=preserve_index,
                                        file_format=file_format,
                                        mode=mode,
                                        compression=compression,
                                        procs_cpu_bound=procs_cpu_bound,
                                        procs_io_bound=procs_io_bound,
                                        cast_columns=cast_columns,
                                        extra_args=extra_args)
        if database:
            self._session.glue.metadata_to_glue(dataframe=dataframe,
                                                path=path,
                                                objects_paths=objects_paths,
                                                database=database,
                                                table=table,
                                                partition_cols=partition_cols,
                                                preserve_index=preserve_index,
                                                file_format=file_format,
                                                mode=mode,
                                                compression=compression,
                                                cast_columns=cast_columns,
                                                extra_args=extra_args)
        return objects_paths
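This variant adds compression, format-specific extra_args, and an inplace flag, and keys cast_columns by column name with Athena/Glue types. A hedged sketch of a CSV write follows, assuming gzip is among Pandas.VALID_CSV_COMPRESSIONS and that the method is exposed as session.pandas.to_s3; all names are placeholders.

    import pandas as pd
    import awswrangler

    # Placeholder session and data.
    session = awswrangler.Session()
    df = pd.DataFrame({"Col Name": [1, 2], "other col": ["x", "y"]})

    # Gzip-compressed CSV with a custom separator passed through extra_args;
    # inplace=False keeps the caller's DataFrame untouched (column names are
    # normalized on a copy instead).
    written = session.pandas.to_s3(dataframe=df,
                                   path="s3://my-bucket/csv-folder/",
                                   file_format="csv",
                                   compression="gzip",
                                   extra_args={"sep": ";"},
                                   preserve_index=False,
                                   mode="append",
                                   inplace=False)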
Example #3
    def to_s3(self,
              dataframe,
              path,
              file_format,
              database=None,
              table=None,
              partition_cols=None,
              preserve_index=True,
              mode="append",
              compression=None,
              procs_cpu_bound=None,
              procs_io_bound=None,
              cast_columns=None,
              extra_args=None):
        """
        Write a Pandas DataFrame to S3.
        Optionally writes metadata to AWS Glue.

        :param dataframe: Pandas DataFrame
        :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
        :param file_format: "csv" or "parquet"
        :param database: AWS Glue Database name
        :param table: AWS Glue table name
        :param partition_cols: List of column names that will be partitions on S3
        :param preserve_index: Should the DataFrame index be preserved on S3?
        :param mode: "append", "overwrite", "overwrite_partitions"
        :param compression: None, gzip, snappy, etc.
        :param procs_cpu_bound: Number of cores used for CPU bound tasks
        :param procs_io_bound: Number of cores used for I/O bound tasks
        :param cast_columns: Dictionary of column indexes and Arrow types to be cast. (E.g. {2: "int64", 5: "int32"}) (Only for "parquet" file_format)
        :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
        :return: List of objects written on S3
        """
        if compression is not None:
            compression = compression.lower()
        file_format = file_format.lower()
        if file_format not in ["parquet", "csv"]:
            raise UnsupportedFileFormat(file_format)
        if file_format == "csv":
            if compression not in Pandas.VALID_CSV_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}"
                )
        elif file_format == "parquet":
            if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}"
                )
        if dataframe.empty:
            raise EmptyDataframe()
        if not partition_cols:
            partition_cols = []
        if mode == "overwrite" or (mode == "overwrite_partitions"
                                   and not partition_cols):
            self._session.s3.delete_objects(path=path)
        elif mode not in ["overwrite_partitions", "append"]:
            raise UnsupportedWriteMode(mode)
        objects_paths = self.data_to_s3(dataframe=dataframe,
                                        path=path,
                                        partition_cols=partition_cols,
                                        preserve_index=preserve_index,
                                        file_format=file_format,
                                        mode=mode,
                                        compression=compression,
                                        procs_cpu_bound=procs_cpu_bound,
                                        procs_io_bound=procs_io_bound,
                                        cast_columns=cast_columns,
                                        extra_args=extra_args)
        if database:
            self._session.glue.metadata_to_glue(dataframe=dataframe,
                                                path=path,
                                                objects_paths=objects_paths,
                                                database=database,
                                                table=table,
                                                partition_cols=partition_cols,
                                                preserve_index=preserve_index,
                                                file_format=file_format,
                                                mode=mode,
                                                compression=compression,
                                                cast_columns=cast_columns,
                                                extra_args=extra_args)
        return objects_paths
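This variant keeps compression and extra_args but, like Example #1, keys cast_columns by column index with Arrow types. A hedged sketch of a snappy-compressed Parquet write, assuming snappy is among Pandas.VALID_PARQUET_COMPRESSIONS and the same session.pandas.to_s3 entry point; names are placeholders.

    import pandas as pd
    import awswrangler

    # Placeholder session and data.
    session = awswrangler.Session()
    df = pd.DataFrame({"id": [1, 2, 3], "score": [0.5, 0.7, 0.9]})

    # Snappy-compressed Parquet; cast_columns here maps column index 1
    # ("score") to an Arrow type, unlike the name-based mapping in Example #2.
    written = session.pandas.to_s3(dataframe=df,
                                   path="s3://my-bucket/parquet-folder/",
                                   file_format="parquet",
                                   compression="snappy",
                                   cast_columns={1: "float32"},
                                   database="my_database",
                                   table="my_scores",
                                   mode="append")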