def _arrow_table_to_pandas(
    cls, arrow_table: pa.Table, categories, **kwargs
) -> pd.DataFrame:
    _kwargs = kwargs.get("arrow_to_pandas", {})
    _kwargs.update({"use_threads": False, "ignore_metadata": False})
    return arrow_table.to_pandas(categories=categories, **_kwargs)
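A minimal usage sketch (the owning class `SomeEngine` is hypothetical): because `use_threads` and `ignore_metadata` are applied after the merge, any caller-supplied values for those two keys are silently overridden.

import pyarrow as pa

table = pa.table({"a": [1, 2, 3]})
# The "arrow_to_pandas" dict is unpacked into to_pandas(); use_threads below
# is discarded because the update() call above runs after the merge.
df = SomeEngine._arrow_table_to_pandas(
    table,
    categories=None,
    arrow_to_pandas={"split_blocks": True, "use_threads": True},
)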
def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:
    return table.to_pandas(
        ignore_metadata=True,  # noqa
        date_as_object=False,  # noqa
        timestamp_as_object=False,  # noqa
        types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
    )
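A sketch of what a mapping like `__ARROW_TO_PANDAS_TYPE_MAPPING` could contain: `types_mapper` is a callable from an Arrow `DataType` to a pandas extension dtype, and `dict.get` works because it returns `None` for unmapped types, letting Arrow fall back to its defaults.

import pandas as pd
import pyarrow as pa

# Stand-in for the class attribute; keys are Arrow types, values are
# nullable pandas extension dtypes.
ARROW_TO_PANDAS = {
    pa.int64(): pd.Int64Dtype(),
    pa.string(): pd.StringDtype(),
}

table = pa.table({"n": pa.array([1, None, 3], type=pa.int64())})
df = table.to_pandas(types_mapper=ARROW_TO_PANDAS.get)
print(df["n"].dtype)  # Int64 (nullable), instead of float64 with NaN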
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            categories=categories,
            safe=safe,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    return _utils.ensure_df_is_mutable(df=df)
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    metadata: Dict[str, Any] = {}
    if table.schema.metadata is not None and b"pandas" in table.schema.metadata:
        metadata = json.loads(table.schema.metadata[b"pandas"])
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            strings_to_categorical=False,
            safe=safe,
            categories=categories,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    df = _utils.ensure_df_is_mutable(df=df)
    if metadata:
        _logger.debug("metadata: %s", metadata)
        df = _apply_index(df=df, metadata=metadata)
        df = _apply_timezone(df=df, metadata=metadata)
    return df
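A small round-trip sketch of the `b"pandas"` schema metadata this function reads: `pa.Table.from_pandas()` embeds index (and timezone) information under that key, which is what `_apply_index`/`_apply_timezone` restore.

import json

import pandas as pd
import pyarrow as pa

src = pd.DataFrame({"v": [1, 2]}, index=pd.Index(["a", "b"], name="k"))
tbl = pa.Table.from_pandas(src)
meta = json.loads(tbl.schema.metadata[b"pandas"])
print(meta["index_columns"])  # ['k'] -- the index _apply_index would restore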
def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table):
    # When the timestamp column is converted to pandas it gets cast to ns
    # resolution, but Delta Lake schemas only support us resolution.
    sample_pandas = sample_data.to_pandas().drop(["timestamp"], axis=1)
    write_deltalake(str(tmp_path), sample_pandas)
    delta_table = DeltaTable(str(tmp_path))
    df = delta_table.to_pandas()
    assert_frame_equal(df, sample_pandas)
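A sketch of the resolution mismatch the comment describes; the exact behavior is version-dependent (older pyarrow/pandas pairings coerce to nanoseconds, while pandas 2.x with recent pyarrow can preserve `datetime64[us]`).

import pyarrow as pa

t = pa.table({"timestamp": pa.array([0], type=pa.timestamp("us"))})
# Historically prints datetime64[ns], which no longer matches the Delta
# schema's us-resolution timestamps on round-trip.
print(t.to_pandas()["timestamp"].dtype)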
def _table2df(
    table: pa.Table, categories: Optional[List[str]] = None, use_threads: bool = True
) -> pd.DataFrame:
    return table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        categories=categories,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
def _write_partitioned_table_from_source(
    column_names: List[str],
    table: pa.Table,
    feature_table_date_partition_column: str,
    feature_table_timestamp_column: str,
) -> str:
    """
    Partitions dataset by date based on timestamp_column.
    Assumes date_partition_column is in date format if provided.

    Args:
        column_names: Column names in provided ingestion source
        table: PyArrow table of Dataset
        feature_table_date_partition_column: Date-partition column of FeatureTable
        feature_table_timestamp_column: Timestamp column of FeatureTable

    Returns:
        str: Root directory which contains date partitioned files.
    """
    dir_path = tempfile.mkdtemp()

    # Case: date_partition_column is provided and dataset does not contain it
    if feature_table_date_partition_column not in column_names:
        df = table.to_pandas()
        df[feature_table_date_partition_column] = df[
            feature_table_timestamp_column
        ].dt.date
        table = pa.Table.from_pandas(df)

    pq.write_to_dataset(
        table=table,
        root_path=dir_path,
        partition_cols=[feature_table_date_partition_column],
    )
    # Remove table from memory
    del table

    return dir_path
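A hedged usage sketch for the function above; the column names ("driver_id", "event_timestamp", "date") are assumptions for illustration. Reading the directory back shows the partition column reconstructed from the directory names.

import datetime

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table(
    {
        "driver_id": [1, 2],
        "event_timestamp": pa.array(
            [datetime.datetime(2024, 1, 1), datetime.datetime(2024, 1, 2)],
            type=pa.timestamp("us"),
        ),
    }
)
root = _write_partitioned_table_from_source(
    column_names=table.column_names,
    table=table,
    feature_table_date_partition_column="date",
    feature_table_timestamp_column="event_timestamp",
)
# The "date" column comes back, rebuilt from the partition paths.
print(pq.ParquetDataset(root).read().column_names)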
def _handle_table_dataframe(
    table: pa.Table,
    mappings: Optional[Dict],
    raise_on_empty: bool = True,
    sort_columns: Optional[List] = None,
    as_type: Optional[Dict] = None,
):
    df = table.to_pandas().drop_duplicates()
    # Guard against mappings=None, which the annotation allows.
    for col in mappings or {}:
        df.loc[:, col] = df[col].map(mappings[col])
    if df.empty and raise_on_empty:
        # These names come from the calling query context; use .get so
        # missing names render as None instead of raising KeyError.
        local_vars = dict(locals())
        kw = [
            f"{k}={local_vars.get(k)}"
            for k in ("filter_expr", "instrument_ids", "start", "end")
        ]
        raise ValueError(f"Data empty for {kw}")
    if sort_columns:
        df = df.sort_values(sort_columns)
    if as_type:
        df = df.astype(as_type)
    return df
def extract_column(self, pa_table: pa.Table) -> np.ndarray:
    series = pa_table.to_pandas(types_mapper=pandas_types_mapper)[
        pa_table.column_names[0]
    ]
    return self._series_to_numpy(series)
def convert_table_to_df(table: pa.Table) -> pd.DataFrame:
    return table.to_pandas(integer_object_nulls=True)
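A quick demonstration of why `integer_object_nulls=True` matters: without it, integer columns containing nulls are promoted to float64 with NaN; with it, they stay object-typed with Python ints and None.

import pyarrow as pa

t = pa.table({"n": pa.array([1, None, 3], type=pa.int64())})
print(t.to_pandas()["n"].dtype)                           # float64
print(t.to_pandas(integer_object_nulls=True)["n"].dtype)  # object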
def print_expected(expected: Table):
    print('==================== EXPECTED ========================')
    print(expected.to_pandas())
def print_actual(actual: Table):
    print('==================== ACTUAL ==========================')
    print(actual.to_pandas())
def extract_batch(self, pa_table: pa.Table) -> pd.DataFrame:
    return pa_table.to_pandas(types_mapper=pandas_types_mapper)
def extract_column(self, pa_table: pa.Table) -> pd.Series:
    return pa_table.to_pandas(types_mapper=pandas_types_mapper)[
        pa_table.column_names[0]
    ]
def extract_batch(self, pa_table: pa.Table) -> dict:
    df = pa_table.to_pandas(types_mapper=pandas_types_mapper)
    return {k: self._series_to_numpy(v) for k, v in df.items()}
def m_o(engine: NativeExecutionEngine, df: pa.Table) -> None:
    assert 1 == df.to_pandas().shape[0]
def convert_table_to_df(table: pa.Table) -> pd.DataFrame:
    try:
        return table.to_pandas(integer_object_nulls=True)
    except pa.lib.ArrowInvalid:
        return table.to_pandas(integer_object_nulls=True, timestamp_as_object=True)
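The fallback exists for timestamps outside pandas' `datetime64[ns]` range (roughly years 1677-2262). A sketch, assuming a pyarrow/pandas pairing that coerces timestamps to nanoseconds:

import datetime

import pyarrow as pa

t = pa.table(
    {"ts": pa.array([datetime.datetime(9999, 1, 1)], type=pa.timestamp("us"))}
)
try:
    t.to_pandas()  # may raise ArrowInvalid: out-of-bounds timestamp
except pa.lib.ArrowInvalid:
    df = t.to_pandas(timestamp_as_object=True)  # plain Python datetimes
    print(type(df["ts"][0]))  # <class 'datetime.datetime'>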
def __arrow_to_pandas(table: pyarrow.Table) -> pd.DataFrame:
    return table.to_pandas(
        date_as_object=False,
        deduplicate_objects=True,
        ignore_metadata=True,
    )  # TODO ensure dictionaries stay dictionaries
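A sketch of what `deduplicate_objects=True` is for: repeated values in object columns should resolve to the same interned Python object, reducing memory for low-cardinality string columns (at some conversion-speed cost).

import pyarrow as pa

t = pa.table({"s": ["x", "x", "x"]})
df = t.to_pandas(deduplicate_objects=True)
# With deduplication, identical strings share one Python object.
print(df["s"][0] is df["s"][1])  # expected: True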