import logging
from typing import Dict, List, Optional, Tuple

# `data_types` (the Pandas/PyArrow/Spark/Athena -> Redshift type-mapping helpers) and
# `InvalidDataframeType` are provided by the surrounding module's imports in the original
# source and are assumed to be in scope here.

logger = logging.getLogger(__name__)


def _get_redshift_schema(dataframe,
                         dataframe_type: str,
                         preserve_index: bool = False,
                         cast_columns=None) -> List[Tuple[str, str]]:
    """Build a Redshift schema as a list of (column name, Redshift type) tuples."""
    if cast_columns is None:
        cast_columns = {}
    schema_built: List[Tuple[str, str]] = []
    if dataframe_type.lower() == "pandas":
        pyarrow_schema = data_types.extract_pyarrow_schema_from_pandas(dataframe=dataframe,
                                                                       preserve_index=preserve_index,
                                                                       indexes_position="right")
        for name, dtype in pyarrow_schema:
            if name in cast_columns:
                # An explicit cast wins over the type inferred by PyArrow.
                schema_built.append((name, cast_columns[name]))
            else:
                redshift_type = data_types.pyarrow2redshift(dtype)
                schema_built.append((name, redshift_type))
    elif dataframe_type.lower() == "spark":
        for name, dtype in dataframe.dtypes:
            if name in cast_columns:
                # Casts are given as Athena types and translated to Redshift types.
                redshift_type = data_types.athena2redshift(cast_columns[name])
            else:
                redshift_type = data_types.spark2redshift(dtype)
            schema_built.append((name, redshift_type))
    else:
        raise InvalidDataframeType(f"{dataframe_type} is not a valid DataFrame type. Please use 'pandas' or 'spark'!")
    return schema_built


def _get_redshift_schema(dataframe, dataframe_type, preserve_index=False, cast_columns=None):
    if cast_columns is None:
        cast_columns = {}
    schema_built = []
    if dataframe_type == "pandas":
        pyarrow_schema = data_types.extract_pyarrow_schema_from_pandas(
            dataframe=dataframe, preserve_index=preserve_index, indexes_position="right")
        for name, dtype in pyarrow_schema:
            if name in cast_columns:
                schema_built.append((name, cast_columns[name]))
            else:
                redshift_type = data_types.pyarrow2redshift(dtype)
                schema_built.append((name, redshift_type))
    elif dataframe_type == "spark":
        for name, dtype in dataframe.dtypes:
            if name in cast_columns:
                redshift_type = data_types.athena2redshift(cast_columns[name])
            else:
                redshift_type = data_types.spark2redshift(dtype)
            schema_built.append((name, redshift_type))
    else:
        raise InvalidDataframeType(dataframe_type)
    return schema_built


def _get_redshift_schema(dataframe,
                         dataframe_type: str,
                         preserve_index: bool = False,
                         cast_columns=None,
                         varchar_default_length: int = 256,
                         varchar_lengths: Optional[Dict[str, int]] = None) -> List[Tuple[str, str]]:
    """Variant that also applies per-column VARCHAR lengths, falling back to varchar_default_length."""
    cast_columns = {} if cast_columns is None else cast_columns
    varchar_lengths = {} if varchar_lengths is None else varchar_lengths
    schema_built: List[Tuple[str, str]] = []
    if dataframe_type.lower() == "pandas":
        # Columns with an explicit cast are passed as ignore_cols so PyArrow type inference skips them.
        ignore_cols = list(cast_columns.keys())
        pyarrow_schema = data_types.extract_pyarrow_schema_from_pandas(dataframe=dataframe,
                                                                       preserve_index=preserve_index,
                                                                       indexes_position="right",
                                                                       ignore_cols=ignore_cols)
        for name, dtype in pyarrow_schema:
            if name in cast_columns:
                schema_built.append((name, cast_columns[name]))
            else:
                varchar_len = varchar_lengths.get(name, varchar_default_length)
                redshift_type = data_types.pyarrow2redshift(dtype=dtype, varchar_length=varchar_len)
                schema_built.append((name, redshift_type))
    elif dataframe_type.lower() == "spark":
        logger.debug(f"cast_columns.keys: {cast_columns.keys()}")
        for name, dtype in dataframe.dtypes:
            varchar_len = varchar_lengths.get(name, varchar_default_length)
            if name in cast_columns:
                redshift_type = data_types.athena2redshift(dtype=cast_columns[name], varchar_length=varchar_len)
            else:
                redshift_type = data_types.spark2redshift(dtype=dtype, varchar_length=varchar_len)
            schema_built.append((name, redshift_type))
    else:
        raise InvalidDataframeType(f"{dataframe_type} is not a valid DataFrame type. Please use 'pandas' or 'spark'!")
    return schema_built
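

# A minimal usage sketch, illustrative only and not part of the original module: it assumes a
# pandas DataFrame and that the module-level dependencies referenced above are importable. The
# DataFrame contents and the per-column VARCHAR length are hypothetical.
if __name__ == "__main__":
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"], "price": [1.5, 2.0]})
    schema = _get_redshift_schema(dataframe=df,
                                  dataframe_type="pandas",
                                  preserve_index=False,
                                  varchar_default_length=256,
                                  varchar_lengths={"name": 32})
    print(schema)  # a list of (column name, Redshift type) tuples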