Example #1
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert a PyArrow Table into a pandas DataFrame, restoring pandas metadata."""
    metadata: Dict[str, Any] = {}
    # pyarrow stores pandas-specific schema info (index columns, dtypes,
    # timezones) as a JSON blob under the b"pandas" metadata key.
    if table.schema.metadata is not None and b"pandas" in table.schema.metadata:
        metadata = json.loads(table.schema.metadata[b"pandas"])
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            strings_to_categorical=False,
            safe=safe,
            categories=categories,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    df = _utils.ensure_df_is_mutable(df=df)
    if metadata:
        _logger.debug("metadata: %s", metadata)
        # Restore the index and timezone information recorded by pyarrow.
        df = _apply_index(df=df, metadata=metadata)
        df = _apply_timezone(df=df, metadata=metadata)
    return df
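
A minimal usage sketch, assuming this helper sits on awswrangler's internal Parquet read path; the file path and argument values below are illustrative, not taken from the source:

# Hypothetical usage sketch (path and argument values are assumptions).
import pyarrow.parquet as pq

table = pq.read_table("sample.parquet")  # any pa.Table will do
df = _arrowtable2df(
    table=table,
    categories=None,  # do not force any column to pandas Categorical
    safe=True,        # raise on unsafe Arrow -> pandas casts
    use_threads=True,
    dataset=False,    # not a partitioned dataset, so no partition columns to apply
    path="s3://bucket/key.parquet",  # illustrative
    path_root=None,
)
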
Example #2
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert a PyArrow Table into a pandas DataFrame (variant without pandas-metadata handling)."""
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            categories=categories,
            safe=safe,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    return _utils.ensure_df_is_mutable(df=df)
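
For contrast with Example #1: the pandas metadata this variant deliberately skips is the JSON blob pyarrow writes under the b"pandas" schema key. A self-contained sketch of producing and inspecting that blob (the sample frame is illustrative):

# Sketch: the b"pandas" schema metadata consumed by the first variant.
import json

import pandas as pd
import pyarrow as pa

src = pd.DataFrame({"v": [1, 2]}, index=pd.Index(["a", "b"], name="k"))
tbl = pa.Table.from_pandas(src)  # pyarrow records index/dtype info here
meta = json.loads(tbl.schema.metadata[b"pandas"])
print(meta["index_columns"])  # ['k'] -- the index that _apply_index would restore
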
Example #3
def cast_pandas_with_athena_types(df: pd.DataFrame, dtype: Dict[str, str]) -> pd.DataFrame:
    """Cast columns in a Pandas DataFrame."""
    mutability_ensured: bool = False
    for col, athena_type in dtype.items():
        # Skip complex Athena types (array/struct/map); only scalar columns are cast.
        if (
            (col in df.columns)
            and (athena_type.startswith("array") is False)
            and (athena_type.startswith("struct") is False)
            and (athena_type.startswith("map") is False)
        ):
            desired_type: str = athena2pandas(dtype=athena_type)
            current_type: str = _normalize_pandas_dtype_name(dtype=str(df[col].dtypes))
            if desired_type != current_type:  # Needs conversion
                _logger.debug("current_type: %s -> desired_type: %s", current_type, desired_type)
                if mutability_ensured is False:
                    df = _utils.ensure_df_is_mutable(df=df)
                    mutability_ensured = True
                _cast_pandas_column(df=df, col=col, current_type=current_type, desired_type=desired_type)
    return df
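
A sketch of calling this helper; the column names and Athena types are illustrative, and it assumes athena2pandas maps "bigint" and "double" to their pandas equivalents:

# Hypothetical call (columns and Athena types are assumptions).
import pandas as pd

df = pd.DataFrame({"id": ["1", "2"], "price": ["9.5", "10.0"]})
df = cast_pandas_with_athena_types(
    df=df,
    dtype={"id": "bigint", "price": "double", "tags": "array<string>"},
)
# "tags" is skipped twice over: it is not a column of df, and array<...>
# types are excluded by the startswith() guards above.
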
Example #4
def _resolve_sql_query(
    query_id: str,
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    use_threads: bool,
    boto3_session: boto3.Session,
) -> pd.DataFrame:
    """Resolve a Lake Formation query into a single pandas DataFrame."""
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=boto3_session)

    wait_query(query_id=query_id, boto3_session=boto3_session)

    # The LF Query Engine distributes the load across workers.
    # Retrieve the tokens and their associated work units until NextToken is empty.
    # One token can span multiple work units.
    # PageSize determines the size of the "Units" array in each call.
    scan_kwargs: Dict[str, Union[str, int]] = {"QueryId": query_id, "PageSize": 10}
    next_token: Optional[str] = "init_token"  # Dummy token to enter the loop
    token_work_units: List[Tuple[str, int]] = []
    while next_token:
        response = client_lakeformation.get_work_units(**scan_kwargs)
        # Flatten each range into (token, work_unit_id) pairs, e.g.
        # [(Token0, WorkUnitId0), (Token0, WorkUnitId1), (Token1, WorkUnitId2), ...]
        token_work_units.extend(
            (unit["WorkUnitToken"], unit_id)
            for unit in response["WorkUnitRanges"]
            for unit_id in range(unit["WorkUnitIdMin"], unit["WorkUnitIdMax"] + 1)  # Max is inclusive
        )
        next_token = response.get("NextToken", None)
        scan_kwargs["NextToken"] = next_token

    tables: List[Table] = []
    if use_threads is False:
        tables = list(
            _get_work_unit_results(
                query_id=query_id,
                token_work_unit=token_work_unit,
                client_lakeformation=client_lakeformation,
            )
            for token_work_unit in token_work_units
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            tables = list(
                executor.map(
                    _get_work_unit_results,
                    itertools.repeat(query_id),
                    token_work_units,
                    itertools.repeat(client_lakeformation),
                )
            )
    table = concat_tables(tables)
    args = {
        "use_threads": use_threads,
        "split_blocks": True,
        "self_destruct": True,
        "integer_object_nulls": False,
        "date_as_object": True,
        "ignore_metadata": True,
        "strings_to_categorical": False,
        "categories": categories,
        "safe": safe,
        "types_mapper": _data_types.pyarrow2pandas_extension if map_types else None,
    }
    return _utils.ensure_df_is_mutable(df=table.to_pandas(**args))
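
A sketch of the calling side, assuming the query id comes from Lake Formation's StartQueryPlanning API; the database, query string, and session are illustrative:

# Hypothetical driver (database, query, and session are assumptions).
import boto3

session = boto3.Session()
lf_client = session.client("lakeformation")
response = lf_client.start_query_planning(
    QueryPlanningContext={"DatabaseName": "my_db"},  # illustrative database
    QueryString="SELECT * FROM my_table",            # illustrative query
)
df = _resolve_sql_query(
    query_id=response["QueryId"],
    categories=None,
    safe=True,
    map_types=True,  # use the pyarrow -> pandas extension-dtype mapper
    use_threads=True,
    boto3_session=session,
)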