Пример #1
0
 def add_partitions(self, database, table, partition_paths, file_format, compression, extra_args=None):
     """
     Register partitions on a Glue table, sending them in batches of at most 100.

     Errors reported by Glue with the code "AlreadyExistsException" are ignored, and so
     are reported errors that carry no "ErrorDetail"/"ErrorCode" entry.

     :param database: Glue database name
     :param table: Glue table name
     :param partition_paths: Partition descriptors handed to the Glue partition-definition builders
     :param file_format: "parquet" or "csv"
     :param compression: Compression handed to the partition-definition builders
     :param extra_args: Extra arguments, only forwarded for the "csv" format
     :return: None
     :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
     :raises ApiError: If Glue reports an error code other than "AlreadyExistsException"
     """
     if not partition_paths:
         return None

     definitions = []
     for item in partition_paths:
         if file_format == "parquet":
             definitions.append(
                 Glue.parquet_partition_definition(partition=item, compression=compression))
         elif file_format == "csv":
             definitions.append(
                 Glue.csv_partition_definition(partition=item,
                                               compression=compression,
                                               extra_args=extra_args))
         else:
             raise UnsupportedFileFormat(file_format)

     # One request per consecutive slice of up to 100 definitions.
     for offset in range(0, len(definitions), 100):
         response = self._client_glue.batch_create_partition(
             DatabaseName=database,
             TableName=table,
             PartitionInputList=definitions[offset:offset + 100])
         for error in response["Errors"]:
             if "ErrorDetail" not in error:
                 continue
             if "ErrorCode" not in error["ErrorDetail"]:
                 continue
             if error["ErrorDetail"]["ErrorCode"] != "AlreadyExistsException":
                 raise ApiError(f"{error}")
Пример #2
0
 def _data_to_s3_object_writer(dataframe, path, preserve_index,
                               session_primitives, file_format):
     """
     Write a dataframe as a single, uniquely named object under the given S3 path.

     The target "directory" is created first if it does not exist; the object name
     is a generated GUID followed by the format extension.

     :param dataframe: Dataframe to be written
     :param path: S3 path that will contain the new object
     :param preserve_index: Forwarded to the underlying dataframe writer
     :param session_primitives: Session primitives used to build the S3 filesystem
     :param file_format: "parquet" or "csv"
     :return: Full path of the written object
     :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
     """
     filesystem = pyarrow.filesystem._ensure_filesystem(
         s3.get_fs(session_primitives=session_primitives))
     s3.mkdir_if_not_exists(filesystem, path)

     # Pick the extension and the matching writer in a single pass.
     if file_format == "parquet":
         extension, write = ".parquet", Pandas.write_parquet_dataframe
     elif file_format == "csv":
         extension, write = ".csv", Pandas.write_csv_dataframe
     else:
         raise UnsupportedFileFormat(file_format)

     object_path = "/".join([path, pyarrow.compat.guid() + extension])
     write(
         dataframe=dataframe,
         path=object_path,
         preserve_index=preserve_index,
         fs=filesystem,
     )
     return object_path
Пример #3
0
    def create_table(
            self,
            database,
            table,
            schema,
            path,
            file_format,
            compression,
            partition_cols_schema=None,
            extra_args=None,
            description: Optional[str] = None,
            parameters: Optional[Dict[str, str]] = None,
            columns_comments: Optional[Dict[str, str]] = None) -> None:
        """
        Build a table definition and create the table in the Glue catalog.

        :param database: AWS Glue Database name
        :param table: AWS Glue table name
        :param schema: Table schema
        :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
        :param file_format: "csv" or "parquet"
        :param compression: None, gzip, snappy, etc
        :param partition_cols_schema: Partitions schema
        :param extra_args: Extra arguments specific for each file formats (E.g. "sep" for CSV)
        :param description: Table description
        :param parameters: Key/value pairs to tag the table (Optional[Dict[str, str]])
        :param columns_comments: Columns names and the related comments (Optional[Dict[str, str]])
        :return: None
        :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
        """

        def _attach_comments(fields):
            # Copy the comment onto every field whose name has an entry in columns_comments.
            for field in fields:
                field_name = field["Name"]
                if field_name in columns_comments:
                    field["Comment"] = columns_comments[field_name]

        if file_format == "parquet":
            table_input = Glue.parquet_table_definition(
                table, partition_cols_schema, schema, path, compression)
        elif file_format == "csv":
            table_input = Glue.csv_table_definition(
                table, partition_cols_schema, schema, path, compression,
                extra_args=extra_args)
        else:
            raise UnsupportedFileFormat(file_format)

        if description is not None:
            table_input["Description"] = description
        if parameters is not None:
            for key, value in parameters.items():
                table_input["Parameters"][key] = value
        if columns_comments is not None:
            # Regular columns first, then the partition keys.
            _attach_comments(table_input["StorageDescriptor"]["Columns"])
            _attach_comments(table_input["PartitionKeys"])

        self._client_glue.create_table(DatabaseName=database,
                                       TableInput=table_input)
Пример #4
0
def read(
    path,
    header="infer",
    names=None,
    dtype=None,
    sep=",",
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    escapechar=None,
    parse_dates=False,
    infer_datetime_format=False,
    encoding=None,
    file_format="csv",
    region=None,
    key=None,
    secret=None,
    profile=None,
):
    """
    Download a single S3 object into memory and load it as a Pandas DataFrame.

    :param path: S3 path of the object, split into bucket and key by parse_path
    :param header: Forwarded to pandas.read_csv (CSV only)
    :param names: Forwarded to pandas.read_csv (CSV only)
    :param dtype: Forwarded to pandas.read_csv (CSV only)
    :param sep: Forwarded to pandas.read_csv (CSV only)
    :param lineterminator: Forwarded to pandas.read_csv (CSV only)
    :param quotechar: Forwarded to pandas.read_csv (CSV only)
    :param quoting: Forwarded to pandas.read_csv (CSV only)
    :param escapechar: Forwarded to pandas.read_csv (CSV only)
    :param parse_dates: Forwarded to pandas.read_csv (CSV only)
    :param infer_datetime_format: Forwarded to pandas.read_csv (CSV only)
    :param encoding: Forwarded to pandas.read_csv (CSV only)
    :param file_format: "csv" or "parquet" (case insensitive)
    :param region: Used to build the session primitives
    :param key: Used to build the session primitives
    :param secret: Used to build the session primitives
    :param profile: Used to build the session primitives
    :return: Pandas DataFrame with the object's content
    :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
    """
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    session_primitives = SessionPrimitives(
        region=region, key=key, secret=secret, profile=profile
    )
    session = get_session(session_primitives=session_primitives)
    bucket_name, key_path = parse_path(path)
    s3_client = session.client("s3", use_ssl=True)
    # The context manager releases the buffer even if the download or the parsing raises.
    with BytesIO() as buff:
        s3_client.download_fileobj(bucket_name, key_path, buff)
        buff.seek(0)
        if file_format == "csv":
            return pandas.read_csv(
                buff,
                header=header,
                names=names,
                sep=sep,
                quotechar=quotechar,
                quoting=quoting,
                escapechar=escapechar,
                parse_dates=parse_dates,
                infer_datetime_format=infer_datetime_format,
                lineterminator=lineterminator,
                dtype=dtype,
                encoding=encoding,
            )
        # Only "parquet" reaches this point; previously the object was downloaded
        # and then silently discarded (None was returned).
        return pandas.read_parquet(buff)
Пример #5
0
    def _data_to_s3_object_writer(dataframe,
                                  path,
                                  preserve_index,
                                  compression,
                                  session_primitives,
                                  file_format,
                                  cast_columns=None,
                                  extra_args=None,
                                  isolated_dataframe=False):
        """
        Write a dataframe as a single, uniquely named object under the given S3 path.

        The object name is "<guid>.<format><compression suffix>", where the suffix is
        empty for no compression, ".snappy" for "snappy" and ".gz" for "gzip".

        :param dataframe: Dataframe to be written
        :param path: S3 path that will contain the new object
        :param preserve_index: Forwarded to the underlying dataframe writer
        :param compression: None, "snappy" or "gzip"
        :param session_primitives: Session primitives used to build the S3 filesystem
        :param file_format: "parquet" or "csv"
        :param cast_columns: Forwarded to the parquet writer only
        :param extra_args: Forwarded to the CSV writer only
        :param isolated_dataframe: Forwarded to the parquet writer only
        :return: Full path of the written object
        :raises InvalidCompression: If compression is not one of the values above
        :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
        """
        filesystem = pyarrow.filesystem._ensure_filesystem(
            s3.get_fs(session_primitives=session_primitives))
        s3.mkdir_if_not_exists(filesystem, path)

        if compression is None:
            compression_end = ""
        elif compression == "snappy":
            compression_end = ".snappy"
        elif compression == "gzip":
            compression_end = ".gz"
        else:
            raise InvalidCompression(compression)

        guid = pyarrow.compat.guid()
        # Name the object and write it within the same format branch.
        if file_format == "parquet":
            object_path = "/".join([path, f"{guid}.parquet{compression_end}"])
            Pandas.write_parquet_dataframe(dataframe=dataframe,
                                           path=object_path,
                                           preserve_index=preserve_index,
                                           compression=compression,
                                           fs=filesystem,
                                           cast_columns=cast_columns,
                                           isolated_dataframe=isolated_dataframe)
        elif file_format == "csv":
            object_path = "/".join([path, f"{guid}.csv{compression_end}"])
            Pandas.write_csv_dataframe(dataframe=dataframe,
                                       path=object_path,
                                       preserve_index=preserve_index,
                                       compression=compression,
                                       fs=filesystem,
                                       extra_args=extra_args)
        else:
            raise UnsupportedFileFormat(file_format)
        return object_path
Пример #6
0
 def create_table(self,
                  database,
                  table,
                  schema,
                  path,
                  file_format,
                  partition_cols=None):
     """
     Build a table definition for the given format and create the table in Glue.

     :param database: Glue database name
     :param table: Glue table name
     :param schema: Table schema
     :param path: S3 path of the data
     :param file_format: "parquet" or "csv"
     :param partition_cols: Partition columns handed to the definition builder
     :return: None
     :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
     """
     if file_format == "parquet":
         build_definition = Glue.parquet_table_definition
     elif file_format == "csv":
         build_definition = Glue.csv_table_definition
     else:
         raise UnsupportedFileFormat(file_format)
     definition = build_definition(table, partition_cols, schema, path)
     self._client_glue.create_table(DatabaseName=database,
                                    TableInput=definition)
Пример #7
0
def _write_data(
    df,
    path,
    session_primitives,
    partition_cols=None,
    preserve_index=True,
    file_format="parquet",
    mode="append",
    num_procs=None,
    num_files=2,
):
    """
    Write a dataframe to S3, partitioned or as plain files.

    A single trailing "/" is stripped from the path. When partition columns are given,
    the work is delegated to write_dataset_manager and its result is returned;
    otherwise write_file_manager is used and None is returned.

    :param df: Dataframe to be written
    :param path: Target S3 path
    :param session_primitives: Session primitives forwarded to the writers
    :param partition_cols: Partition columns (None or empty means no partitioning)
    :param preserve_index: Forwarded to the writers
    :param file_format: "parquet" or "csv" (case insensitive)
    :param mode: Write mode, forwarded to the partitioned writer only
    :param num_procs: Number of processes; defaults to the CPU count when falsy
    :param num_files: Forwarded to the partitioned writer only
    :return: Result of write_dataset_manager for partitioned writes, otherwise None
    :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
    """
    num_procs = num_procs or mp.cpu_count()
    if path[-1] == "/":
        path = path[:-1]
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)

    is_partitioned = partition_cols is not None and len(partition_cols) > 0
    if is_partitioned:
        return write_dataset_manager(
            df=df,
            path=path,
            partition_cols=partition_cols,
            session_primitives=session_primitives,
            preserve_index=preserve_index,
            file_format=file_format,
            mode=mode,
            num_procs=num_procs,
            num_files=num_files,
        )

    write_file_manager(
        df=df,
        path=path,
        preserve_index=preserve_index,
        session_primitives=session_primitives,
        file_format=file_format,
        num_procs=num_procs,
    )
    return None
Пример #8
0
 def create_table(self,
                  database,
                  table,
                  schema,
                  path,
                  file_format,
                  partition_cols=None):
     """
     Build a table definition for the given format and create the table in Glue.

     A Glue client is created from the session's boto3 session on every call.

     :param database: Glue database name
     :param table: Glue table name
     :param schema: Table schema
     :param path: S3 path of the data
     :param file_format: "parquet" or "csv"
     :param partition_cols: Partition columns handed to the definition builder
     :return: None
     :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
     """
     glue_client = self._session.boto3_session.client(
         service_name="glue", config=self._session.botocore_config)
     if file_format == "parquet":
         build_definition = Glue.parquet_table_definition
     elif file_format == "csv":
         build_definition = Glue.csv_table_definition
     else:
         raise UnsupportedFileFormat(file_format)
     definition = build_definition(table, partition_cols, schema, path)
     glue_client.create_table(DatabaseName=database, TableInput=definition)
Пример #9
0
 def add_partitions(self, database, table, partition_paths, file_format):
     """
     Register partitions on a Glue table, sending them in batches of at most 100.

     :param database: Glue database name
     :param table: Glue table name
     :param partition_paths: Partition descriptors handed to the Glue partition-definition builders
     :param file_format: "parquet" or "csv"
     :return: None
     :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
     """
     if not partition_paths:
         return None

     definitions = []
     for item in partition_paths:
         if file_format == "parquet":
             definitions.append(Glue.parquet_partition_definition(item))
         elif file_format == "csv":
             definitions.append(Glue.csv_partition_definition(item))
         else:
             raise UnsupportedFileFormat(file_format)

     # One request per consecutive slice of up to 100 definitions.
     for offset in range(0, len(definitions), 100):
         self._client_glue.batch_create_partition(
             DatabaseName=database,
             TableName=table,
             PartitionInputList=definitions[offset:offset + 100])
Пример #10
0
    def create_glue_table(self,
                          database,
                          path,
                          dataframe,
                          file_format,
                          compression,
                          table=None,
                          serde=None,
                          sep=",",
                          partition_by=None,
                          load_partitions=True,
                          replace_if_exists=True,
                          description: Optional[str] = None,
                          parameters: Optional[Dict[str, str]] = None,
                          columns_comments: Optional[Dict[str, str]] = None):
        """
        Create a Glue metadata table pointing to a dataset stored on AWS S3.

        :param dataframe: PySpark Dataframe
        :param file_format: File format (E.g. "parquet", "csv")
        :param partition_by: Columns used for partitioning
        :param path: AWS S3 path
        :param compression: Compression (e.g. gzip, snappy, lzo, etc)
        :param sep: Separator token for CSV formats (e.g. ",", ";", "|")
        :param serde: Serializer/Deserializer (e.g. "OpenCSVSerDe", "LazySimpleSerDe")
        :param database: Glue database name
        :param table: Glue table name. If not passed, extracted from the path
        :param load_partitions: Load partitions after the table creation
        :param replace_if_exists: Drop the table first if it already exists; a falsy value skips the drop
        :param description: Table description
        :param parameters: Key/value pairs to tag the table (Optional[Dict[str, str]])
        :param columns_comments: Columns names and the related comments (Optional[Dict[str, str]])
        :return: None
        :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
        """
        file_format = file_format.lower()
        if file_format not in ["parquet", "csv"]:
            raise UnsupportedFileFormat(file_format)
        table = table if table else self._session.glue.parse_table_name(path)
        # The table name is lower-cased and dots are replaced by underscores.
        table = table.lower().replace(".", "_")
        logger.debug(f"table: {table}")
        full_schema = dataframe.dtypes
        if partition_by is None:
            partition_by = []
        # Split the (name, type) pairs into regular columns and partition columns.
        schema = [x for x in full_schema if x[0] not in partition_by]
        partitions_schema_tmp = {
            x[0]: x[1]
            for x in full_schema if x[0] in partition_by
        }
        # Partition columns follow the order given in partition_by, not the dataframe order.
        partitions_schema = [(x, partitions_schema_tmp[x])
                             for x in partition_by]
        logger.debug(f"schema: {schema}")
        logger.debug(f"partitions_schema: {partitions_schema}")
        # Truthiness check: the former "is not None" test also dropped the table
        # when the caller explicitly passed replace_if_exists=False.
        if replace_if_exists:
            self._session.glue.delete_table_if_exists(database=database,
                                                      table=table)
        extra_args = {}
        if file_format == "csv":
            extra_args["sep"] = sep
            if serde is None:
                serde = "OpenCSVSerDe"
            extra_args["serde"] = serde
        self._session.glue.create_table(
            database=database,
            table=table,
            schema=schema,
            partition_cols_schema=partitions_schema,
            path=path,
            file_format=file_format,
            compression=compression,
            extra_args=extra_args,
            description=description,
            parameters=parameters,
            columns_comments=columns_comments)
        if load_partitions:
            self._session.athena.repair_table(database=database, table=table)
Пример #11
0
    def to_s3(self,
              dataframe,
              path,
              file_format,
              database=None,
              table=None,
              partition_cols=None,
              preserve_index=True,
              mode="append",
              compression=None,
              procs_cpu_bound=None,
              procs_io_bound=None,
              cast_columns=None,
              extra_args=None,
              inplace=True):
        """
        Write a Pandas Dataframe on S3 and, when a database is given, register its metadata on AWS Glue.

        Column names, cast_columns keys and partition column names are normalized for Athena
        and duplicated columns are dropped before anything is written.

        :param dataframe: Pandas Dataframe
        :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
        :param file_format: "csv" or "parquet"
        :param database: AWS Glue Database name
        :param table: AWS Glue table name
        :param partition_cols: List of columns names that will be partitions on S3
        :param preserve_index: Should preserve index on S3?
        :param mode: "append", "overwrite", "overwrite_partitions"
        :param compression: None, gzip, snappy, etc
        :param procs_cpu_bound: Number of cores used for CPU bound tasks
        :param procs_io_bound: Number of cores used for I/O bound tasks
        :param cast_columns: Dictionary of columns names and Athena/Glue types to be casted. (E.g. {"col name": "bigint", "col2 name": "int"}) (Only for "parquet" file_format)
        :param extra_args: Extra arguments specific for each file formats (E.g. "sep" for CSV)
        :param inplace: True is cheapest (CPU and Memory) but False leaves your DataFrame intact
        :return: List of objects written on S3
        :raises InvalidCompression: If the compression is not valid for the file format
        :raises UnsupportedFileFormat: If file_format is neither "csv" nor "parquet"
        :raises EmptyDataframe: If the dataframe is empty
        :raises UnsupportedWriteMode: If mode is not one of the supported modes
        """
        partition_cols = partition_cols if partition_cols else []
        cast_columns = cast_columns if cast_columns else {}
        dataframe = Pandas.normalize_columns_names_athena(dataframe, inplace=inplace)
        cast_columns = {
            Athena.normalize_column_name(col_name): col_type
            for col_name, col_type in cast_columns.items()
        }
        logger.debug(f"cast_columns: {cast_columns}")
        partition_cols = [Athena.normalize_column_name(col_name) for col_name in partition_cols]
        logger.debug(f"partition_cols: {partition_cols}")
        dataframe = Pandas.drop_duplicated_columns(dataframe=dataframe, inplace=inplace)

        if compression is not None:
            compression = compression.lower()
        file_format = file_format.lower()
        if file_format == "csv":
            if compression not in Pandas.VALID_CSV_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}")
        elif file_format == "parquet":
            if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}")
        else:
            raise UnsupportedFileFormat(file_format)
        if dataframe.empty:
            raise EmptyDataframe()

        # "overwrite_partitions" without partition columns wipes the whole path, like "overwrite".
        wipe_path = (mode == "overwrite") or (mode == "overwrite_partitions" and not partition_cols)
        if wipe_path:
            self._session.s3.delete_objects(path=path)
        elif mode not in ("overwrite_partitions", "append"):
            raise UnsupportedWriteMode(mode)

        # Keyword arguments shared by the data writer and the Glue metadata writer.
        shared_kwargs = dict(dataframe=dataframe,
                             path=path,
                             partition_cols=partition_cols,
                             preserve_index=preserve_index,
                             file_format=file_format,
                             mode=mode,
                             compression=compression,
                             cast_columns=cast_columns,
                             extra_args=extra_args)
        objects_paths = self.data_to_s3(procs_cpu_bound=procs_cpu_bound,
                                        procs_io_bound=procs_io_bound,
                                        **shared_kwargs)
        if database:
            self._session.glue.metadata_to_glue(objects_paths=objects_paths,
                                                database=database,
                                                table=table,
                                                **shared_kwargs)
        return objects_paths
Пример #12
0
    def to_s3(self,
              dataframe,
              path,
              file_format,
              database=None,
              table=None,
              partition_cols=None,
              preserve_index=True,
              mode="append",
              compression=None,
              procs_cpu_bound=None,
              procs_io_bound=None,
              cast_columns=None,
              extra_args=None):
        """
        Write a Pandas Dataframe on S3 and, when a database is given, register its metadata on AWS Glue.

        :param dataframe: Pandas Dataframe
        :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
        :param file_format: "csv" or "parquet"
        :param database: AWS Glue Database name
        :param table: AWS Glue table name
        :param partition_cols: List of columns names that will be partitions on S3
        :param preserve_index: Should preserve index on S3?
        :param mode: "append", "overwrite", "overwrite_partitions"
        :param compression: None, gzip, snappy, etc
        :param procs_cpu_bound: Number of cores used for CPU bound tasks
        :param procs_io_bound: Number of cores used for I/O bound tasks
        :param cast_columns: Dictionary of columns indexes and Arrow types to be casted. (E.g. {2: "int64", 5: "int32"}) (Only for "parquet" file_format)
        :param extra_args: Extra arguments specific for each file formats (E.g. "sep" for CSV)
        :return: List of objects written on S3
        :raises UnsupportedFileFormat: If file_format is neither "csv" nor "parquet"
        :raises InvalidCompression: If the compression is not valid for the file format
        :raises EmptyDataframe: If the dataframe is empty
        :raises UnsupportedWriteMode: If mode is not one of the supported modes
        """
        if compression is not None:
            compression = compression.lower()
        file_format = file_format.lower()
        # The format is validated before the compression, in a single chain.
        if file_format == "csv":
            if compression not in Pandas.VALID_CSV_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}"
                )
        elif file_format == "parquet":
            if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
                raise InvalidCompression(
                    f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}"
                )
        else:
            raise UnsupportedFileFormat(file_format)
        if dataframe.empty:
            raise EmptyDataframe()
        partition_cols = partition_cols if partition_cols else []

        # "overwrite_partitions" without partition columns wipes the whole path, like "overwrite".
        wipe_path = mode == "overwrite" or (mode == "overwrite_partitions" and not partition_cols)
        if wipe_path:
            self._session.s3.delete_objects(path=path)
        elif mode not in ("overwrite_partitions", "append"):
            raise UnsupportedWriteMode(mode)

        # Keyword arguments shared by the data writer and the Glue metadata writer.
        shared_kwargs = dict(dataframe=dataframe,
                             path=path,
                             partition_cols=partition_cols,
                             preserve_index=preserve_index,
                             file_format=file_format,
                             mode=mode,
                             compression=compression,
                             cast_columns=cast_columns,
                             extra_args=extra_args)
        objects_paths = self.data_to_s3(procs_cpu_bound=procs_cpu_bound,
                                        procs_io_bound=procs_io_bound,
                                        **shared_kwargs)
        if database:
            self._session.glue.metadata_to_glue(objects_paths=objects_paths,
                                                database=database,
                                                table=table,
                                                **shared_kwargs)
        return objects_paths
Пример #13
0
 def data_to_s3(
     self,
     dataframe,
     path,
     file_format,
     partition_cols=None,
     preserve_index=True,
     mode="append",
     procs_cpu_bound=None,
     procs_io_bound=None,
 ):
     """
     Write a dataframe to S3, fanning out to one process per dataframe slice when
     more than one CPU-bound process is configured.

     After writing, in "overwrite_partitions" mode with partition columns, the objects
     not listed among the written ones are deleted through the S3 helper.

     :param dataframe: Dataframe to be written
     :param path: Target S3 path (a single trailing "/" is stripped)
     :param file_format: "parquet" or "csv" (case insensitive)
     :param partition_cols: Partition columns forwarded to the dataset writer
     :param preserve_index: Forwarded to the dataset writer
     :param mode: Write mode; only "overwrite_partitions" is acted upon here
     :param procs_cpu_bound: Number of writer processes; session default when falsy
     :param procs_io_bound: Number of I/O processes; session default when falsy
     :return: List of the written objects paths
     :raises UnsupportedFileFormat: If file_format is neither "parquet" nor "csv"
     """
     procs_cpu_bound = procs_cpu_bound or self._session.procs_cpu_bound
     procs_io_bound = procs_io_bound or self._session.procs_io_bound
     logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
     logger.debug(f"procs_io_bound: {procs_io_bound}")
     if path[-1] == "/":
         path = path[:-1]
     file_format = file_format.lower()
     if file_format not in ["parquet", "csv"]:
         raise UnsupportedFileFormat(file_format)

     objects_paths = []
     if procs_cpu_bound > 1:
         bounders = _get_bounders(dataframe=dataframe,
                                  num_partitions=procs_cpu_bound)
         workers = []
         for bounder in bounders:
             receive_pipe, send_pipe = mp.Pipe()
             worker = mp.Process(
                 target=self._data_to_s3_dataset_writer_remote,
                 args=(
                     send_pipe,
                     dataframe.iloc[bounder[0]:bounder[1], :],
                     path,
                     partition_cols,
                     preserve_index,
                     self._session.primitives,
                     file_format,
                 ),
             )
             worker.daemon = False
             worker.start()
             workers.append((worker, receive_pipe))
         # Read each worker's result before joining it, then release its pipe.
         for worker, receive_pipe in workers:
             objects_paths.extend(receive_pipe.recv())
             worker.join()
             receive_pipe.close()
     else:
         objects_paths.extend(
             self._data_to_s3_dataset_writer(
                 dataframe=dataframe,
                 path=path,
                 partition_cols=partition_cols,
                 preserve_index=preserve_index,
                 session_primitives=self._session.primitives,
                 file_format=file_format,
             ))

     if mode == "overwrite_partitions" and partition_cols:
         num_procs = (floor(float(procs_io_bound) / float(procs_cpu_bound))
                      if procs_io_bound > procs_cpu_bound else 1)
         logger.debug(
             f"num_procs for delete_not_listed_objects: {num_procs}")
         self._session.s3.delete_not_listed_objects(
             objects_paths=objects_paths, procs_io_bound=num_procs)
     return objects_paths