Example #1
from typing import List, Tuple

# Import paths assume the legacy aws-data-wrangler (awswrangler 0.x) layout.
from awswrangler import data_types
from awswrangler.exceptions import InvalidDataframeType


def _get_redshift_schema(dataframe,
                         dataframe_type: str,
                         preserve_index: bool = False,
                         cast_columns=None) -> List[Tuple[str, str]]:
    """Map a pandas or Spark DataFrame's columns to Redshift column types."""
    if cast_columns is None:
        cast_columns = {}
    schema_built: List[Tuple[str, str]] = []
    if dataframe_type.lower() == "pandas":
        pyarrow_schema = data_types.extract_pyarrow_schema_from_pandas(dataframe=dataframe,
                                                                       preserve_index=preserve_index,
                                                                       indexes_position="right")
        for name, dtype in pyarrow_schema:
            # cast_columns is never None here, so a membership test suffices.
            if name in cast_columns:
                schema_built.append((name, cast_columns[name]))
            else:
                redshift_type = data_types.pyarrow2redshift(dtype)
                schema_built.append((name, redshift_type))
    elif dataframe_type.lower() == "spark":
        for name, dtype in dataframe.dtypes:
            if name in cast_columns:
                redshift_type = data_types.athena2redshift(cast_columns[name])
            else:
                redshift_type = data_types.spark2redshift(dtype)
            schema_built.append((name, redshift_type))
    else:
        raise InvalidDataframeType(
            f"{dataframe_type} is not a valid DataFrame type. Please use 'pandas' or 'spark'!")
    return schema_built
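
A minimal usage sketch for the variant above; the DataFrame and its column names are invented for illustration, and cast_columns overrides the inferred type for a single column:

# Hypothetical example; df and its columns are invented.
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "price": [1.5, 2.0]})

# Force "price" to a fixed-precision type instead of the inferred float mapping.
schema = _get_redshift_schema(dataframe=df,
                              dataframe_type="pandas",
                              cast_columns={"price": "DECIMAL(10,2)"})
# schema is a list of (column_name, redshift_type) tuples; the exact strings
# for uncast columns depend on data_types.pyarrow2redshift's mappings.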
Example #2
# Uses the same imports as Example #1 (data_types, InvalidDataframeType).
def _get_redshift_schema(dataframe,
                         dataframe_type,
                         preserve_index=False,
                         cast_columns=None):
    if cast_columns is None:
        cast_columns = {}
    schema_built = []
    # Unlike Example #1, the type check here is case-sensitive.
    if dataframe_type == "pandas":
        pyarrow_schema = data_types.extract_pyarrow_schema_from_pandas(
            dataframe=dataframe,
            preserve_index=preserve_index,
            indexes_position="right")
        for name, dtype in pyarrow_schema:
            if name in cast_columns:
                schema_built.append((name, cast_columns[name]))
            else:
                redshift_type = data_types.pyarrow2redshift(dtype)
                schema_built.append((name, redshift_type))
    elif dataframe_type == "spark":
        for name, dtype in dataframe.dtypes:
            if name in cast_columns:
                redshift_type = data_types.athena2redshift(
                    cast_columns[name])
            else:
                redshift_type = data_types.spark2redshift(dtype)
            schema_built.append((name, redshift_type))
    else:
        # This variant raises with the bare type instead of a formatted message.
        raise InvalidDataframeType(dataframe_type)
    return schema_built
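
Both variants raise InvalidDataframeType when dataframe_type is neither "pandas" nor "spark". A minimal sketch of that exception, assuming it is a plain Exception subclass in the library's exceptions module:

# Minimal sketch; the real class lives in the library's exceptions module.
class InvalidDataframeType(Exception):
    """Raised when dataframe_type is not 'pandas' or 'spark'."""

Note the behavioral difference between the two variants: Example #1 lower-cases dataframe_type and raises with a descriptive message, while Example #2 matches case-sensitively and raises with the bare value.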
Example #3
import logging
from typing import Dict, List, Optional, Tuple

# data_types and InvalidDataframeType are imported as in Example #1.
logger = logging.getLogger(__name__)


def _get_redshift_schema(dataframe,
                         dataframe_type: str,
                         preserve_index: bool = False,
                         cast_columns=None,
                         varchar_default_length: int = 256,
                         varchar_lengths: Optional[Dict[str, int]] = None) -> List[Tuple[str, str]]:
    """Like the variants above, but with configurable per-column VARCHAR lengths."""
    cast_columns = {} if cast_columns is None else cast_columns
    varchar_lengths = {} if varchar_lengths is None else varchar_lengths
    schema_built: List[Tuple[str, str]] = []
    if dataframe_type.lower() == "pandas":
        # Columns with explicit casts are excluded from pyarrow type inference;
        # cast_columns is never None here, so no extra guard is needed.
        ignore_cols = list(cast_columns.keys())
        pyarrow_schema = data_types.extract_pyarrow_schema_from_pandas(dataframe=dataframe,
                                                                       preserve_index=preserve_index,
                                                                       indexes_position="right",
                                                                       ignore_cols=ignore_cols)
        for name, dtype in pyarrow_schema:
            if name in cast_columns:
                schema_built.append((name, cast_columns[name]))
            else:
                varchar_len = varchar_lengths.get(name, varchar_default_length)
                redshift_type = data_types.pyarrow2redshift(dtype=dtype, varchar_length=varchar_len)
                schema_built.append((name, redshift_type))
    elif dataframe_type.lower() == "spark":
        logger.debug(f"cast_columns.keys: {cast_columns.keys()}")
        for name, dtype in dataframe.dtypes:
            varchar_len = varchar_lengths.get(name, varchar_default_length)
            if name in cast_columns:
                redshift_type = data_types.athena2redshift(dtype=cast_columns[name], varchar_length=varchar_len)
            else:
                redshift_type = data_types.spark2redshift(dtype=dtype, varchar_length=varchar_len)
            schema_built.append((name, redshift_type))
    else:
        raise InvalidDataframeType(
            f"{dataframe_type} is not a valid DataFrame type. Please use 'pandas' or 'spark'!")
    return schema_built
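
A usage sketch for the VARCHAR-aware variant (the DataFrame and column names are invented): per-column lengths come from varchar_lengths, with varchar_default_length as the fallback:

# Hypothetical example; df and its columns are invented.
import pandas as pd

df = pd.DataFrame({"sku": ["A1"], "description": ["a long free-text field"]})

schema = _get_redshift_schema(dataframe=df,
                              dataframe_type="pandas",
                              varchar_default_length=256,
                              varchar_lengths={"description": 4096})
# "description" maps to VARCHAR(4096); other string columns fall back to the
# 256-character default via varchar_default_length.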