def _use_aggregation(self, agg, columns=None):
    """Compute the result using the aggregation function provided.

    Applies ``agg`` to each column and aliases the result back to the
    plain column name, stripping off the extra name Spark SQL adds
    (e.g. ``AVG(x)`` -> ``x``).

    :param agg: aggregation callable, e.g. ``pyspark.sql.functions.avg``.
    :param columns: columns to aggregate; defaults to all of this
        frame's columns (``self._columns``) when falsy.
    :return: a ``Dataframe`` built from the aggregated schema RDD.
    """
    if not columns:
        columns = self._columns
    # BUG FIX: previously this iterated self._columns unconditionally,
    # silently ignoring an explicitly supplied `columns` argument.
    aggs = [agg(column).alias(column) for column in columns]
    agg_rdd = self._grouped_spark_sql.agg(*aggs)
    return Dataframe.from_schema_rdd(agg_rdd, self._by)
def from_pd_data_frame(self, local_df):
    """Make a distributed dataframe from a local pandas DataFrame.

    The intended use is for testing. Note: dtypes are re-inferred, so
    they may not match the originals.
    """
    def frame_to_rows(frame):
        """Convert a pandas DataFrame into Spark SQL rows."""
        # TODO: Convert to row objects directly?
        return [record.tolist() for record in frame.to_records()]

    # Index columns come first in the Spark schema, followed by the
    # regular data columns.
    index_names = _normalize_index_names(list(local_df.index.names))
    schema = index_names + list(local_df.columns)

    row_rdd = self.spark_ctx.parallelize(frame_to_rows(local_df))
    # samplingRatio=1: inspect every row when inferring types, which
    # should be fine since the data originated from a local dataset.
    schema_rdd = self.sql_ctx.createDataFrame(
        row_rdd, schema=schema, samplingRatio=1)

    result = Dataframe.from_schema_rdd(schema_rdd)
    result._index_names = index_names
    return result