示例#1
0
def spark2redshift(dtype: str, varchar_length: int = 256) -> str:
    """Translate a Spark type name into the matching Redshift column type.

    The lookup ignores case. ``string`` becomes ``VARCHAR(varchar_length)``
    and any type whose name starts with ``decimal`` is passed through
    upper-cased with its spaces removed.

    Raises:
        UnsupportedType: if the Spark type has no Redshift counterpart here.
    """
    fixed_types = {
        "smallint": "SMALLINT",
        "int": "INT",
        "bigint": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "boolean": "BOOLEAN",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
    }
    spark_type = dtype.lower()
    if spark_type in fixed_types:
        return fixed_types[spark_type]
    if spark_type == "string":
        return f"VARCHAR({varchar_length})"
    if spark_type.startswith("decimal"):
        # Keep precision/scale, e.g. "decimal(10, 2)" -> "DECIMAL(10,2)".
        return spark_type.replace(" ", "").upper()
    raise UnsupportedType("Unsupported Spark type: " + spark_type)
示例#2
0
def athena2pyarrow(dtype: str) -> str:
    """Return the PyArrow type name that corresponds to an Athena type.

    The lookup ignores case. Character types and the complex types
    (array, row, map) are all reported as ``string``.

    Raises:
        UnsupportedType: if the Athena type is not in the table below.
    """
    arrow_by_athena = {
        "tinyint": "int8",
        "smallint": "int16",
        "int": "int32",
        "integer": "int32",
        "bigint": "int64",
        "float": "float32",
        "double": "float64",
        "boolean": "bool",
        "bool": "bool",
        "string": "string",
        "char": "string",
        "varchar": "string",
        "array": "string",
        "row": "string",
        "map": "string",
        "timestamp": "timestamp[ns]",
        "date": "date32",
    }
    athena_type = dtype.lower()
    arrow_type = arrow_by_athena.get(athena_type)
    if arrow_type is None:
        raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
    return arrow_type
示例#3
0
def pyarrow2mysql(dtype: pa.types, varchar_length: int = 256) -> str:
    """Translate a PyArrow type into a MySQL column type.

    The decision is made on the lower-cased ``str(dtype)``. ``string``
    becomes ``VARCHAR(varchar_length)``; names starting with ``timestamp``
    or ``date`` collapse to ``TIMESTAMP`` / ``DATE``; names starting with
    ``decimal`` are passed through upper-cased with spaces removed.

    Raises:
        UnsupportedType: if no rule matches the type name.
    """
    arrow_name = str(dtype).lower()
    exact_types = {
        "int16": "SMALLINT",
        "int32": "INT",
        "int64": "BIGINT",
        "float": "FLOAT",
        "double": "DOUBLE",
        "bool": "BOOLEAN",
    }
    mysql_type = exact_types.get(arrow_name)
    if mysql_type is not None:
        return mysql_type
    if arrow_name == "string":
        return f"VARCHAR({varchar_length})"
    # Parameterised names (e.g. "timestamp[ns]") are matched by prefix.
    for prefix, sql_name in (("timestamp", "TIMESTAMP"), ("date", "DATE")):
        if arrow_name.startswith(prefix):
            return sql_name
    if arrow_name.startswith("decimal"):
        return arrow_name.replace(" ", "").upper()
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
示例#4
0
def pyarrow2athena(dtype: pa.types) -> str:
    """Translate a PyArrow type into an Athena type name.

    The decision is made on the lower-cased ``str(dtype)``. List types are
    converted recursively through ``dtype.value_type`` into ``array<...>``;
    decimal names are returned lower-cased with spaces removed.

    Raises:
        UndetectedType: if the type is ``null`` (nothing to infer from).
        UnsupportedType: if no rule matches the type name.
    """
    arrow_name = str(dtype).lower()
    scalar_types = {
        "int8": "tinyint",
        "int16": "smallint",
        "int32": "int",
        "int64": "bigint",
        "float": "float",
        "double": "double",
        "bool": "boolean",
        "string": "string",
    }
    if arrow_name in scalar_types:
        return scalar_types[arrow_name]
    # For these two the Athena name is the prefix itself.
    for prefix in ("timestamp", "date"):
        if arrow_name.startswith(prefix):
            return prefix
    if arrow_name.startswith("decimal"):
        return arrow_name.replace(" ", "")
    if arrow_name.startswith("list"):
        element_type = pyarrow2athena(dtype.value_type)
        return f"array<{element_type}>"
    if arrow_name == "null":
        raise UndetectedType("We can't infer the data type from an entire null object column")
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
示例#5
0
def pyarrow2athena(dtype):
    """Translate a PyArrow type into an Athena type name.

    The decision is made on the lower-cased ``str(dtype)``. List types are
    converted recursively through ``dtype.value_type`` into ``array<...>``.

    Raises:
        UnsupportedType: if no rule matches the type name.
    """
    arrow_name = str(dtype).lower()
    scalar_types = {
        "int8": "tinyint",
        "int16": "smallint",
        "int32": "int",
        "int64": "bigint",
        "float": "float",
        "double": "double",
        "bool": "boolean",
        "string": "string",
    }
    if arrow_name in scalar_types:
        return scalar_types[arrow_name]
    # For these two the Athena name is the prefix itself.
    for prefix in ("timestamp", "date"):
        if arrow_name.startswith(prefix):
            return prefix
    if arrow_name.startswith("list"):
        element_type = pyarrow2athena(dtype.value_type)
        return f"array<{element_type}>"
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
示例#6
0
 def _type_athena2pandas(dtype):
     """Return the pandas dtype name used for a given Athena type.

     The lookup ignores case. Raises UnsupportedType when the Athena type
     belongs to none of the groups below.
     """
     athena_type = dtype.lower()
     groups = (
         (("int", "integer", "bigint", "smallint", "tinyint"), "Int64"),
         (("float", "double", "real"), "float64"),
         (("boolean",), "bool"),
         (("string", "char", "varchar", "array", "row", "map"), "object"),
         (("timestamp", "date"), "datetime64"),
     )
     for aliases, pandas_type in groups:
         if athena_type in aliases:
             return pandas_type
     raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
示例#7
0
    def _build_schema(
        dataframe,
        partition_cols: Optional[List[str]],
        preserve_index: bool,
        indexes_position: str,
        cast_columns: Optional[Dict[str, str]] = None
    ) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
        """Split a dataframe's columns into a regular and a partition schema.

        Column names and PyArrow types come from
        ``data_types.extract_pyarrow_schema_from_pandas``. A column listed in
        ``cast_columns`` takes the type given there; every other column is
        converted with ``data_types.pyarrow2athena``.

        Returns:
            ``(schema, partition_schema)``, both lists of ``(name, type)``
            tuples. The partition list follows the order of ``partition_cols``.

        Raises:
            UndetectedType: a column's type could not be inferred; the error
                is re-raised with the column name in the message.
            UnsupportedType: a column's PyArrow type has no mapping; the error
                is re-raised with the column name in the message.
        """
        casts: Dict[str, str] = {} if cast_columns is None else cast_columns
        logger.debug(f"dataframe.dtypes:\n{dataframe.dtypes}")
        partitions: List[str] = [] if partition_cols is None else partition_cols

        arrow_columns: List[Tuple[str, Any]] = data_types.extract_pyarrow_schema_from_pandas(
            dataframe=dataframe,
            preserve_index=preserve_index,
            indexes_position=indexes_position)

        schema_built: List[Tuple[str, str]] = []
        types_of_partitions: Dict[str, str] = {}
        for name, dtype in arrow_columns:
            # An explicit cast always wins over the inferred type.
            if name in casts:
                column_type = casts[name]
            else:
                try:
                    column_type = data_types.pyarrow2athena(dtype)
                except UndetectedType:
                    raise UndetectedType(
                        f"We can't infer the data type from an entire null object column ({name}). "
                        f"Please consider pass the type of this column explicitly using the cast "
                        f"columns argument")
                except UnsupportedType:
                    raise UnsupportedType(
                        f"Unsupported Pyarrow type for column {name}: {dtype}")
            if name in partitions:
                types_of_partitions[name] = column_type
            else:
                schema_built.append((name, column_type))

        # Emit partition columns in the caller's order, not the dataframe's.
        partition_cols_schema_built: List = [
            (col, types_of_partitions[col]) for col in partitions
        ]

        logger.debug(f"schema_built:\n{schema_built}")
        logger.debug(
            f"partition_cols_schema_built:\n{partition_cols_schema_built}")
        return schema_built, partition_cols_schema_built
示例#8
0
 def _type_spark2redshift(dtype):
     """Translate a Spark type name into a Redshift column type.

     The lookup ignores case. All integer widths handled here (smallint,
     int, bigint) are mapped to ``BIGINT``. Both ``bool`` and ``boolean``
     are accepted for booleans, since Spark reports the latter.

     Raises:
         UnsupportedType: if the Spark type is not handled.
     """
     dtype = dtype.lower()
     if dtype in ("smallint", "int", "bigint"):
         return "BIGINT"
     if dtype == "float":
         return "FLOAT4"
     if dtype == "double":
         return "FLOAT8"
     # Previously only "bool" matched, so Spark's own "boolean" was rejected.
     if dtype in ("bool", "boolean"):
         return "BOOLEAN"
     if dtype == "timestamp":
         return "TIMESTAMP"
     if dtype == "string":
         return "VARCHAR(256)"
     raise UnsupportedType("Unsupported Spark type: " + dtype)
示例#9
0
 def _type_spark2redshift(dtype):
     """Translate a type name into a Redshift column type.

     The lookup ignores case. Names starting with ``datetime.datetime``
     map to ``TIMESTAMP``.

     Raises:
         UnsupportedType: if the type name is not handled.
     """
     dtype = dtype.lower()
     exact_types = {
         "int": "INTEGER",
         "long": "BIGINT",
         "float": "FLOAT8",
         "bool": "BOOLEAN",
         "string": "VARCHAR(256)",
     }
     if dtype in exact_types:
         return exact_types[dtype]
     # The old test compared a 10-character slice with this 17-character
     # literal, so it could never be true; match on the prefix instead.
     if dtype.startswith("datetime.datetime"):
         return "TIMESTAMP"
     raise UnsupportedType("Unsupported Spark type: " + dtype)
示例#10
0
 def type_python2athena(python_type):
     """Map a Python type to the Athena type name used for its values.

     The lookup is done on ``str(python_type)``, so the argument can be the
     type object itself or its ``"<class '...'>"`` representation.

     Raises:
         UnsupportedType: if the type is not one of int, float, bool, str,
             datetime.datetime or datetime.date.
     """
     python_type = str(python_type)
     athena_by_repr = {
         "<class 'int'>": "bigint",
         "<class 'float'>": "double",
         # Was spelled "boll", which sent every bool to the error branch.
         "<class 'bool'>": "boolean",
         "<class 'str'>": "string",
         "<class 'datetime.datetime'>": "timestamp",
         "<class 'datetime.date'>": "date",
     }
     athena_type = athena_by_repr.get(python_type)
     if athena_type is None:
         raise UnsupportedType(f"Unsupported Python type: {python_type}")
     return athena_type
示例#11
0
 def type_athena2python(dtype):
     """Return the Python type used to hold values of an Athena type.

     The lookup ignores case. Character types and the complex types
     (array, row, map) are all represented as ``str``.

     Raises:
         UnsupportedType: if the Athena type is not handled.
     """
     athena_type = dtype.lower()
     if athena_type == "date":
         return date
     if athena_type == "timestamp":
         return datetime
     if athena_type == "boolean":
         return bool
     if athena_type in ("float", "double", "real"):
         return float
     if athena_type in ("int", "integer", "bigint", "smallint", "tinyint"):
         return int
     if athena_type in ("string", "char", "varchar", "array", "row", "map"):
         return str
     raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
示例#12
0
def python2athena(python_type: type) -> str:
    """Map a Python type to the Athena type name used for its values.

    The lookup is done on ``str(python_type)``.

    Raises:
        UnsupportedType: if the type is not one of int, float, bool, str,
            datetime.datetime or datetime.date.
    """
    python_type_str: str = str(python_type)
    athena_by_repr = {
        "<class 'int'>": "bigint",
        "<class 'float'>": "double",
        # Was spelled "boll", which sent every bool to the error branch.
        "<class 'bool'>": "boolean",
        "<class 'str'>": "string",
        "<class 'datetime.datetime'>": "timestamp",
        "<class 'datetime.date'>": "date",
    }
    athena_type = athena_by_repr.get(python_type_str)
    if athena_type is None:
        raise UnsupportedType(f"Unsupported Python type: {python_type_str}")
    return athena_type
示例#13
0
def athena2pandas(dtype):
    """Return the pandas-side type tag used for a given Athena type.

    The lookup ignores case. Besides dtype names, the result can be one of
    the tags ``"date"`` or ``"literal_eval"`` (returned for ``array``).

    Raises:
        UnsupportedType: if the Athena type is not handled.
    """
    athena_type = dtype.lower()
    groups = (
        (("int", "integer", "bigint", "smallint", "tinyint"), "Int64"),
        (("float", "double", "real"), "float64"),
        (("boolean",), "bool"),
        (("string", "char", "varchar"), "str"),
        (("timestamp", "timestamp with time zone"), "datetime64"),
        (("date",), "date"),
        (("array",), "literal_eval"),
    )
    for aliases, pandas_tag in groups:
        if athena_type in aliases:
            return pandas_tag
    raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
示例#14
0
def _type_pandas2athena(dtype):
    """Translate a pandas dtype name into an Athena type name.

    The lookup ignores case. ``object`` columns map to ``string``, as do
    dtypes whose name starts with ``datetime64``.

    Raises:
        UnsupportedType: if the pandas dtype is not handled.
    """
    dtype = dtype.lower()
    # "object" used to be guarded by isinstance(dtype, string_types) as well.
    # dtype has already gone through .lower(), so that guard added nothing
    # and only tied this function to an externally provided name.
    exact_types = {
        "int32": "int",
        "int64": "bigint",
        "float32": "float",
        "float64": "double",
        "bool": "boolean",
        "object": "string",
    }
    if dtype in exact_types:
        return exact_types[dtype]
    if dtype.startswith("datetime64"):
        # NOTE(review): datetimes map to "string" here, not "timestamp" -
        # kept as in the original; confirm this is intended.
        return "string"
    raise UnsupportedType("Unsupported Pandas type: " + dtype)
示例#15
0
def pandas2athena(dtype):
    """Translate a pandas dtype name into an Athena type name.

    The lookup ignores case, so ``Int64`` and ``int64`` both give
    ``bigint``. Any dtype whose name starts with ``datetime64`` maps to
    ``timestamp``.

    Raises:
        UnsupportedType: if the pandas dtype is not handled.
    """
    pandas_type = dtype.lower()
    athena_by_pandas = {
        "int32": "int",
        "int64": "bigint",
        "float32": "float",
        "float64": "double",
        "bool": "boolean",
        "object": "string",
    }
    athena_type = athena_by_pandas.get(pandas_type)
    if athena_type is not None:
        return athena_type
    if pandas_type.startswith("datetime64"):
        return "timestamp"
    raise UnsupportedType(f"Unsupported Pandas type: {pandas_type}")
示例#16
0
def pandas2redshift(dtype):
    """Translate a pandas dtype name into a Redshift column type.

    The lookup ignores case. ``object`` columns become ``VARCHAR(256)`` and
    dtypes whose name starts with ``datetime64`` become ``TIMESTAMP``.

    Raises:
        UnsupportedType: if the pandas dtype is not handled.
    """
    pandas_type = dtype.lower()
    numeric_and_bool = {
        "int32": "INTEGER",
        "int64": "BIGINT",
        "float32": "FLOAT4",
        "float64": "FLOAT8",
        "bool": "BOOLEAN",
    }
    redshift_type = numeric_and_bool.get(pandas_type)
    if redshift_type is not None:
        return redshift_type
    if isinstance(pandas_type, str) and pandas_type == "object":
        return "VARCHAR(256)"
    # "datetime64" is exactly ten characters long.
    if pandas_type[:10] == "datetime64":
        return "TIMESTAMP"
    raise UnsupportedType("Unsupported Pandas type: " + pandas_type)
示例#17
0
 def type_pandas2athena(dtype):
     """Translate a pandas dtype name into an Athena type name.

     The lookup ignores case, so ``Int64`` and ``int64`` both give
     ``bigint``. Dtypes whose name starts with ``datetime64`` map to
     ``timestamp``.

     Raises:
         UnsupportedType: if the pandas dtype is not handled.
     """
     pandas_type = dtype.lower()
     simple_pairs = (
         ("int32", "int"),
         ("int64", "bigint"),
         ("float32", "float"),
         ("float64", "double"),
         ("bool", "boolean"),
     )
     for pandas_name, athena_name in simple_pairs:
         if pandas_type == pandas_name:
             return athena_name
     if isinstance(pandas_type, str) and pandas_type == "object":
         return "string"
     # "datetime64" is exactly ten characters long.
     if pandas_type[:10] == "datetime64":
         return "timestamp"
     raise UnsupportedType(f"Unsupported Pandas type: {pandas_type}")
示例#18
0
def athena2python(dtype: str) -> Optional[type]:
    """Return the Python type used to hold values of an Athena type.

    The lookup ignores case. ``unknown`` yields ``None``; character types
    and the complex types (array, row, map) are represented as ``str``.

    Raises:
        UnsupportedType: if the Athena type is not handled.
    """
    athena_type = dtype.lower()
    if athena_type == "unknown":
        return None
    if athena_type == "decimal":
        return Decimal
    if athena_type == "date":
        return date
    if athena_type == "timestamp":
        return datetime
    if athena_type == "boolean":
        return bool
    if athena_type in {"float", "double", "real"}:
        return float
    if athena_type in {"int", "integer", "bigint", "smallint", "tinyint"}:
        return int
    if athena_type in {"string", "char", "varchar", "array", "row", "map"}:
        return str
    raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
示例#19
0
def pandas2redshift(dtype: str, varchar_length: int = 256) -> str:
    """Translate a pandas dtype name into a Redshift column type.

    The lookup ignores case. ``string`` and ``object`` columns become
    ``VARCHAR(varchar_length)``; dtypes whose name starts with
    ``datetime64`` become ``TIMESTAMP``.

    Raises:
        UnsupportedType: if the pandas dtype is not handled.
    """
    pandas_type = dtype.lower()
    fixed_types = {
        "int32": "INTEGER",
        "int64": "BIGINT",
        "float32": "FLOAT4",
        "float64": "FLOAT8",
        "bool": "BOOLEAN",
    }
    if pandas_type in fixed_types:
        return fixed_types[pandas_type]
    if pandas_type in ("string", "object"):
        return f"VARCHAR({varchar_length})"
    if pandas_type.startswith("datetime64"):
        return "TIMESTAMP"
    raise UnsupportedType("Unsupported Pandas type: " + pandas_type)
示例#20
0
def athena2pandas(dtype: str) -> str:
    """Return the pandas-side type tag used for a given Athena type.

    The lookup ignores case. Besides dtype names, the result can be one of
    the tags ``"date"``, ``"list"`` (for ``array``) or ``"decimal"``.

    Raises:
        UnsupportedType: if the Athena type is not handled.
    """
    pandas_by_athena = {
        "int": "Int64",
        "integer": "Int64",
        "bigint": "Int64",
        "smallint": "Int64",
        "tinyint": "Int64",
        "float": "float64",
        "double": "float64",
        "real": "float64",
        "boolean": "bool",
        "string": "string",
        "char": "string",
        "varchar": "string",
        "timestamp": "datetime64",
        "timestamp with time zone": "datetime64",
        "date": "date",
        "array": "list",
        "decimal": "decimal",
    }
    athena_type = dtype.lower()
    if athena_type not in pandas_by_athena:
        raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
    return pandas_by_athena[athena_type]
示例#21
0
def pandas2athena(dtype: str) -> str:
    """Translate a pandas dtype name into an Athena type name.

    The lookup ignores case, so ``Int64`` and ``int64`` both give
    ``bigint``. ``string`` and ``object`` columns map to ``string``;
    dtypes whose name starts with ``datetime64`` map to ``timestamp``.

    Raises:
        UnsupportedType: if the pandas dtype is not handled.
    """
    pandas_type = dtype.lower()
    athena_by_pandas = {
        "int32": "int",
        "int64": "bigint",
        "float32": "float",
        "float64": "double",
        "bool": "boolean",
        "string": "string",
        "object": "string",
    }
    if pandas_type in athena_by_pandas:
        return athena_by_pandas[pandas_type]
    if pandas_type.startswith("datetime64"):
        return "timestamp"
    raise UnsupportedType(f"Unsupported Pandas type: {pandas_type}")
示例#22
0
def athena2redshift(dtype: str, varchar_length: int = 256) -> str:
    """Translate an Athena type name into a Redshift column type.

    The lookup ignores case. Character types and the complex types
    (array, row, map) all become ``VARCHAR(varchar_length)``.

    Raises:
        UnsupportedType: if the Athena type is not handled.
    """
    athena_type = dtype.lower()
    fixed_types = {
        "smallint": "SMALLINT",
        "int": "INTEGER",
        "integer": "INTEGER",
        "bigint": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "boolean": "BOOL",
        "bool": "BOOL",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
    }
    if athena_type in fixed_types:
        return fixed_types[athena_type]
    if athena_type in ("string", "char", "varchar", "array", "row", "map"):
        return f"VARCHAR({varchar_length})"
    raise UnsupportedType(f"Unsupported Athena type: {athena_type}")
示例#23
0
def redshift2pyarrow(dtype: str) -> str:
    """Translate a Redshift type name into a PyArrow type name.

    Matching is case-insensitive, like the other converters in this file,
    so ``"integer"`` and ``"INTEGER"`` are treated the same.

    Raises:
        UnsupportedType: if the Redshift type is not handled. The message
            carries the type name exactly as it was passed in.
    """
    dtype_str: str = str(dtype)
    # Compare in upper case; the original spelling is kept for the error text.
    redshift_type = dtype_str.upper()
    groups = (
        (("SMALLINT", "INT2"), "int16"),
        (("INTEGER", "INT", "INT4"), "int32"),
        (("BIGINT", "INT8"), "int64"),
        (("REAL", "FLOAT4"), "float32"),
        (("DOUBLE PRECISION", "FLOAT8", "FLOAT"), "float64"),
        (("BOOLEAN", "BOOL"), "bool"),
        (("VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"), "string"),
        (("DATE",), "date32"),
        (("TIMESTAMP",), "timestamp[ns]"),
    )
    for aliases, arrow_type in groups:
        if redshift_type in aliases:
            return arrow_type
    raise UnsupportedType(f"Unsupported Redshift type: {dtype_str}")
示例#24
0
def redshift2athena(dtype: str) -> str:
    """Translate a Redshift type name into an Athena type name.

    Matching is case-insensitive, like the other converters in this file,
    so ``"integer"`` and ``"INTEGER"`` are treated the same.

    Raises:
        UnsupportedType: if the Redshift type is not handled. The message
            carries the type name exactly as it was passed in.
    """
    dtype_str = str(dtype)
    # Compare in upper case; the original spelling is kept for the error text.
    redshift_type = dtype_str.upper()
    groups = (
        (("SMALLINT", "INT2"), "smallint"),
        (("INTEGER", "INT", "INT4"), "int"),
        (("BIGINT", "INT8"), "bigint"),
        (("REAL", "FLOAT4"), "float"),
        (("DOUBLE PRECISION", "FLOAT8", "FLOAT"), "double"),
        (("BOOLEAN", "BOOL"), "boolean"),
        (("VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"), "string"),
        (("DATE",), "date"),
        (("TIMESTAMP",), "timestamp"),
    )
    for aliases, athena_type in groups:
        if redshift_type in aliases:
            return athena_type
    raise UnsupportedType(f"Unsupported Redshift type: {dtype_str}")
示例#25
0
def spark2redshift(dtype: str) -> str:
    """Translate a Spark type name into a Redshift column type.

    The lookup ignores case. Both ``bool`` and ``boolean`` are accepted
    for booleans, since Spark reports the latter. ``string`` columns
    become ``VARCHAR(256)``.

    Raises:
        UnsupportedType: if the Spark type is not handled.
    """
    dtype = dtype.lower()
    redshift_by_spark = {
        "smallint": "SMALLINT",
        "int": "INT",
        "bigint": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        # Previously missing, so Spark's own "boolean" was rejected.
        "boolean": "BOOLEAN",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
        "string": "VARCHAR(256)",
    }
    redshift_type = redshift_by_spark.get(dtype)
    if redshift_type is None:
        raise UnsupportedType("Unsupported Spark type: " + dtype)
    return redshift_type
示例#26
0
def pyarrow2redshift(dtype):
    """Translate a PyArrow type into a Redshift column type.

    The decision is made on the lower-cased ``str(dtype)``. Names starting
    with ``timestamp`` or ``date`` collapse to ``TIMESTAMP`` / ``DATE``.

    Raises:
        UnsupportedType: if no rule matches the type name.
    """
    arrow_name = str(dtype).lower()
    exact_types = {
        "int16": "SMALLINT",
        "int32": "INT",
        "int64": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "string": "VARCHAR(256)",
    }
    if arrow_name in exact_types:
        return exact_types[arrow_name]
    # Parameterised names (e.g. "timestamp[ns]") are matched by prefix.
    for prefix, redshift_type in (("timestamp", "TIMESTAMP"), ("date", "DATE")):
        if arrow_name.startswith(prefix):
            return redshift_type
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
示例#27
0
def pyarrow2postgres(dtype: pa.types) -> str:
    """Translate a PyArrow type into a PostgreSQL column type.

    The decision is made on the lower-cased ``str(dtype)``. Names starting
    with ``timestamp`` or ``date`` collapse to ``TIMESTAMP`` / ``DATE``;
    names starting with ``decimal`` are passed through upper-cased with
    spaces removed.

    Raises:
        UnsupportedType: if no rule matches the type name.
    """
    arrow_name = str(dtype).lower()
    exact_types = {
        "int16": "SMALLINT",
        "int32": "INT",
        "int64": "BIGINT",
        "float": "FLOAT4",
        "double": "FLOAT8",
        "bool": "BOOLEAN",
        "string": "VARCHAR(256)",
    }
    postgres_type = exact_types.get(arrow_name)
    if postgres_type is not None:
        return postgres_type
    # Parameterised names (e.g. "timestamp[ns]") are matched by prefix.
    if arrow_name.startswith("timestamp"):
        return "TIMESTAMP"
    if arrow_name.startswith("date"):
        return "DATE"
    if arrow_name.startswith("decimal"):
        return arrow_name.replace(" ", "").upper()
    raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
示例#28
0
def athena2pyarrow(dtype):
    """Return the PyArrow type name that corresponds to an Athena type.

    The lookup ignores case. Character types and the complex types
    (array, row, map) are all reported as ``string``.

    Raises:
        UnsupportedType: if the Athena type is not handled.
    """
    athena_type = dtype.lower()
    groups = (
        (("tinyint",), "int8"),
        (("smallint",), "int16"),
        (("int", "integer"), "int32"),
        (("bigint",), "int64"),
        (("float",), "float32"),
        (("double",), "float64"),
        (("boolean", "bool"), "bool"),
        (("string", "char", "varchar", "array", "row", "map"), "string"),
        (("timestamp",), "timestamp[ns]"),
        (("date",), "date32"),
    )
    for aliases, arrow_type in groups:
        if athena_type in aliases:
            return arrow_type
    raise UnsupportedType(f"Unsupported Athena type: {athena_type}")