def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert a PyArrow Table into a mutable pandas DataFrame.

    Applies partition columns derived from ``path``/``path_root`` and, when the
    Arrow schema carries pandas metadata, restores the original index and
    timezone information.
    """
    # Grab the pandas-specific schema metadata up front: `self_destruct=True`
    # below consumes the table's buffers during conversion.
    pandas_metadata: Dict[str, Any] = {}
    schema_meta = table.schema.metadata
    if schema_meta is not None and b"pandas" in schema_meta:
        pandas_metadata = json.loads(schema_meta[b"pandas"])
    frame = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,  # release Arrow memory as columns are converted
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        strings_to_categorical=False,
        safe=safe,
        categories=categories,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    frame = _apply_partitions(df=frame, dataset=dataset, path=path, path_root=path_root)
    frame = _utils.ensure_df_is_mutable(df=frame)
    if pandas_metadata:
        _logger.debug("metadata: %s", pandas_metadata)
        # Re-apply the index and timezone info recorded by pandas at write time.
        frame = _apply_index(df=frame, metadata=pandas_metadata)
        frame = _apply_timezone(df=frame, metadata=pandas_metadata)
    return frame
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert a PyArrow Table into a mutable pandas DataFrame with partition columns applied."""
    # Conversion options are collected in one place for readability.
    to_pandas_kwargs: Dict[str, Any] = {
        "use_threads": use_threads,
        "split_blocks": True,
        "self_destruct": True,
        "integer_object_nulls": False,
        "date_as_object": True,
        "ignore_metadata": True,
        "categories": categories,
        "safe": safe,
        "types_mapper": _data_types.pyarrow2pandas_extension,
    }
    frame: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(**to_pandas_kwargs),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    return _utils.ensure_df_is_mutable(df=frame)
def cast_pandas_with_athena_types(df: pd.DataFrame, dtype: Dict[str, str]) -> pd.DataFrame:
    """Cast columns in a Pandas DataFrame to match the given Athena types.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame. A mutable copy is made (once) only if at least one
        column actually needs casting; otherwise ``df`` is returned as-is.
    dtype : Dict[str, str]
        Mapping of column name to Athena type name. Complex types
        ("array", "struct", "map") and names not present in ``df`` are skipped.

    Returns
    -------
    pd.DataFrame
        DataFrame with the requested columns cast.
    """
    mutability_ensured: bool = False
    for col, athena_type in dtype.items():
        # Guard clause: skip absent columns and complex Athena types, which
        # cannot be cast column-wise. startswith accepts a tuple of prefixes.
        if col not in df.columns or athena_type.startswith(("array", "struct", "map")):
            continue
        desired_type: str = athena2pandas(dtype=athena_type)
        current_type: str = _normalize_pandas_dtype_name(dtype=str(df[col].dtypes))
        if desired_type != current_type:  # Needs conversion
            _logger.debug("current_type: %s -> desired_type: %s", current_type, desired_type)
            if not mutability_ensured:
                # Pay for the defensive copy at most once, and only when a cast happens.
                df = _utils.ensure_df_is_mutable(df=df)
                mutability_ensured = True
            _cast_pandas_column(df=df, col=col, current_type=current_type, desired_type=desired_type)
    return df
def _resolve_sql_query(
    query_id: str,
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    use_threads: bool,
    boto3_session: boto3.Session,
) -> pd.DataFrame:
    """Fetch every work unit of a finished Lake Formation query and return one DataFrame.

    Waits for the query, pages through ``GetWorkUnits`` to enumerate all
    (token, work-unit-id) pairs, retrieves each unit's Arrow table
    (sequentially or via a thread pool), concatenates them, and converts the
    result to a mutable pandas DataFrame.

    Parameters
    ----------
    query_id : str
        Lake Formation query identifier.
    categories : Optional[List[str]]
        Columns to load as pandas categoricals.
    safe : bool
        Passed through to ``pyarrow.Table.to_pandas``.
    map_types : bool
        When True, map Arrow types to pandas extension dtypes.
    use_threads : bool
        When True, fetch work units concurrently.
    boto3_session : boto3.Session
        Session used to build the Lake Formation client.

    Returns
    -------
    pd.DataFrame
    """
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=boto3_session)
    wait_query(query_id=query_id, boto3_session=boto3_session)

    # The LF Query Engine distributes the load across workers.
    # Retrieve tokens and their associated work units until NextToken is empty:
    # one token can span multiple work units, and PageSize bounds the size of
    # the "Units" array returned by each call.
    scan_kwargs: Dict[str, Any] = {"QueryId": query_id, "PageSize": 10}
    next_token: Optional[str] = "init_token"  # Dummy token to enter the loop
    token_work_units: List[Tuple[str, int]] = []
    while next_token:
        response = client_lakeformation.get_work_units(**scan_kwargs)
        # [(Token0, WorkUnitId0), (Token0, WorkUnitId1), (Token1, WorkUnitId2), ...]
        token_work_units.extend(
            (unit["WorkUnitToken"], unit_id)
            for unit in response["WorkUnitRanges"]
            for unit_id in range(unit["WorkUnitIdMin"], unit["WorkUnitIdMax"] + 1)  # Max is inclusive
        )
        next_token = response.get("NextToken")  # .get already defaults to None
        scan_kwargs["NextToken"] = next_token

    tables: List[Table]
    if use_threads is False:
        # Sequential retrieval (list comprehension instead of list(generator)).
        tables = [
            _get_work_unit_results(
                query_id=query_id,
                token_work_unit=token_work_unit,
                client_lakeformation=client_lakeformation,
            )
            for token_work_unit in token_work_units
        ]
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            tables = list(
                executor.map(
                    _get_work_unit_results,
                    itertools.repeat(query_id),
                    token_work_units,
                    itertools.repeat(client_lakeformation),
                )
            )
    table = concat_tables(tables)
    args = {
        "use_threads": use_threads,
        "split_blocks": True,
        "self_destruct": True,
        "integer_object_nulls": False,
        "date_as_object": True,
        "ignore_metadata": True,
        "strings_to_categorical": False,
        "categories": categories,
        "safe": safe,
        "types_mapper": _data_types.pyarrow2pandas_extension if map_types else None,
    }
    return _utils.ensure_df_is_mutable(df=table.to_pandas(**args))