def convert_via_pyarrow(sdf: DataFrame) -> pd.DataFrame:
    """Convert a Spark DataFrame to a pandas DataFrame via Arrow batches.

    The frame is collected as Arrow record batches, assembled into a single
    ``pyarrow.Table`` and converted to pandas. Afterwards every column whose
    Spark type is ``TimestampType`` is passed through
    ``_check_series_localize_timestamps`` together with the session-local
    time zone, and every ``MapType`` column is passed through
    ``_convert_map_items_to_dict``.

    Args:
        sdf: The Spark DataFrame to convert.

    Returns:
        A pandas DataFrame carrying the column names of ``sdf``. If the
        collection produces no Arrow batches, an empty pandas DataFrame
        with those column names is returned instead.
    """
    # NOTE(review): presumably raises when the installed pandas is too old;
    # confirm against the helper's definition.
    require_minimum_pandas_version()
    timezone = sdf.sql_ctx._conf.sessionLocalTimeZone()

    # Rename columns to avoid duplicated column names.
    tmp_column_names = [f"col_{i}" for i in range(len(sdf.columns))]
    batches = sdf.toDF(*tmp_column_names)._collect_as_arrow()

    if not batches:
        # Nothing was collected: return an empty frame with the right columns.
        return pd.DataFrame.from_records([], columns=sdf.columns)

    table = pyarrow.Table.from_batches(batches)
    # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
    # values, but we should use datetime.date to match the behavior with when
    # Arrow optimization is disabled.
    pdf = table.to_pandas(date_as_object=True)

    # Rename back to the original column names.
    pdf.columns = sdf.columns

    for field in sdf.schema:
        if isinstance(field.dataType, TimestampType):
            pdf[field.name] = _check_series_localize_timestamps(
                pdf[field.name], timezone
            )
        elif isinstance(field.dataType, MapType):
            pdf[field.name] = _convert_map_items_to_dict(pdf[field.name])
    return pdf
def rename_columns(data: DataFrame, new_column_names: list) -> DataFrame:
    """Return ``data`` with its columns renamed positionally.

    Args:
        data: DataFrame whose ``toDF`` method performs the rename.
        new_column_names: Replacement column names, unpacked in order as the
            positional arguments of ``data.toDF``. Presumably one name per
            existing column is expected; verify against the ``toDF``
            implementation in use.

    Returns:
        Whatever ``data.toDF(*new_column_names)`` returns.
    """
    return data.toDF(*new_column_names)