def test_nested(spark_session):
    """Nested array and struct columns round-trip through SparkDataFrame.

    Verifies both ``as_array`` and ``as_array_iterable`` in type-safe and
    non-type-safe modes, including a struct column with a missing field
    (which must come back as ``None``).
    """
    array_rows = [[[10, 20]]]
    sdf = spark_session.createDataFrame(array_rows, to_spark_schema("a:[int]"))
    df = SparkDataFrame(sdf)
    assert array_rows == df.as_array(type_safe=False)
    assert array_rows == df.as_array(type_safe=True)
    assert array_rows == list(df.as_array_iterable(type_safe=False))
    assert array_rows == list(df.as_array_iterable(type_safe=True))

    struct_rows = [[dict(b=[30, 40])]]
    expected = [[dict(a=None, b=[30, 40])]]
    sdf = spark_session.createDataFrame(
        struct_rows, to_spark_schema("a:{a:str,b:[int]}")
    )
    df = SparkDataFrame(sdf)
    assert expected == df.as_array(type_safe=False)
    assert expected == df.as_array(type_safe=True)
    assert expected == list(df.as_array_iterable(type_safe=False))
    assert expected == list(df.as_array_iterable(type_safe=True))
def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Convert a data structure to
    :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param data: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame or list or iterable of arrays
    :param schema: |SchemaLikeObject| or
      :class:`spark:pyspark.sql.types.StructType` defaults to None.
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it should return itself
    * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
      ``schema`` must be specified
    * When ``schema`` is not None, a potential type cast may happen to
      ensure the dataframe's schema.
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    if isinstance(df, DataFrame):
        # A fugue DataFrame carries its own schema/metadata, so the caller
        # must not supply them again.
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            return df
        if any(pa.types.is_struct(t) for t in df.schema.types):
            # Struct columns: convert via type-safe python arrays rather
            # than pandas, which does not preserve nested struct values.
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        # Native Spark DataFrame: wrap as-is, normalizing schema if given.
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        # RDDs carry no schema, so one must be provided explicitly.
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        # NOTE(review): here the provided schema is passed to SparkDataFrame
        # but not to createDataFrame, so Spark infers its own types from the
        # pandas data — confirm the two always agree for supported inputs.
        sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)
    # use arrow dataframe here to handle nulls in int cols
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
def test_schema_conversion(spark_session):
    """Schema expressions survive a round trip through Spark schemas."""

    def assert_roundtrip(expr):
        assert to_schema(to_spark_schema(expr)) == expr

    assert_roundtrip("a:int,b:long,c:[int],d:datetime,e:date,f:decimal(3,4),g:str")
    assert_roundtrip("a:{a:[int],b:[str]}")
    # TODO: "a:[{a:int}]" (array of struct) is not supported by spark,
    # should we support it?

    # converting an existing spark schema must be a no-op returning the
    # same object
    spark_schema = to_spark_schema(to_spark_schema("a:int"))
    assert to_spark_schema(spark_schema) is spark_schema

    df = spark_session.createDataFrame([[1]], schema=to_spark_schema("a:int"))
    assert to_schema(to_spark_schema(df)) == "a:int"
    assert to_schema(df) == "a:int"
    assert to_schema(dict(a=str)) == "a:str"
def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    """Load CSV file(s) into a :class:`SparkDataFrame`.

    ``infer_schema`` is translated to Spark's ``inferSchema`` option and
    ``header`` controls whether the first row names the columns.

    :param p: list of file paths to load
    :param columns: None, a list of column names, or a schema-like object
    :raises InvalidOperationError: if there is no header and no columns
    :raises NotImplementedError: for unrecognized ``header`` values
    """
    params = ParamDict(kwargs)
    if params.get("infer_schema", False):
        params["inferSchema"] = True
    if "infer_schema" in params:
        del params["infer_schema"]
    # missing header option becomes the string "none"
    header_mode = str(params.get_or_none("header", object)).lower()
    if "header" in params:
        del params["header"]

    reader = self._session.read.format("csv")
    reader.options(**params)

    if header_mode == "true":
        reader.option("header", "true")
        if columns is None:
            return SparkDataFrame(reader.load(p))
        if isinstance(columns, list):  # select by column names
            return SparkDataFrame(reader.load(p)[columns])
        schema = Schema(columns)
        return SparkDataFrame(reader.load(p)[schema.names], schema)

    if header_mode in ["false", "none"]:
        reader.option("header", "false")
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # rename positional columns
            sdf = reader.load(p)
            inferred = to_schema(sdf)
            renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
            return SparkDataFrame(sdf.selectExpr(*renames))
        schema = Schema(columns)
        sdf = reader.schema(to_spark_schema(schema)).load(p)
        return SparkDataFrame(sdf, schema)

    raise NotImplementedError(f"{header_mode} is not supported")
def even_repartition(session: SparkSession, df: ps.DataFrame, num: int, cols: List[Any]) -> ps.DataFrame:
    """Repartition ``df`` so rows are distributed evenly across partitions.

    :param session: active :class:`~pyspark.sql.SparkSession`
    :param df: the Spark DataFrame to repartition
    :param num: target number of partitions; ``0`` means no-op (when no
      cols) or one partition per distinct key (when cols given)
    :param cols: columns defining the partition key; empty means
      partition purely by row position
    :return: the repartitioned Spark DataFrame
    """
    if num == 1:
        return _single_repartition(df)
    if len(cols) == 0:
        if num == 0:
            return df
        # Tag every row with a sequential index, then use the index itself
        # as the partition key (lambda k: k) so rows spread evenly.
        rdd = (_zipWithIndex(df.rdd).partitionBy(
            num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
    else:
        # Number the distinct key combinations, then join the numbering
        # back so each full row carries its key's partition index.
        keys = df.select(*cols).distinct()
        krdd = _zipWithIndex(keys.rdd, True)
        new_schema = to_spark_schema(
            to_schema(df.schema).extract(cols) + f"{_PARTITION_DUMMY_KEY}:long")
        idx = session.createDataFrame(krdd, new_schema)
        if num <= 0:
            # one partition per distinct key; persist before count so the
            # numbering is computed only once
            idx = idx.persist()
            num = idx.count()
        idf = (df.alias("df").join(idx.alias("idx"), on=cols,
                                   how="inner").select(_PARTITION_DUMMY_KEY, "df.*"))

        def _to_kv(rows: Iterable[Any]) -> Iterable[Any]:  # pragma: no cover
            # split each row into (partition_index, remaining_fields)
            for row in rows:
                yield (row[0], row[1:])

        rdd = (idf.rdd.mapPartitions(_to_kv).partitionBy(
            num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
def _df(data, schema=None, metadata=None):
    """Build a native Spark DataFrame, coercing ``data`` through
    :class:`PandasDataFrame` when a schema is supplied."""
    session = SparkSession.builder.getOrCreate()
    if schema is None:
        return session.createDataFrame(data)
    pdf = PandasDataFrame(data, to_schema(schema), metadata)
    return session.createDataFrame(pdf.native, to_spark_schema(schema))
def _map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to each physical partition of ``df`` using
    Spark's ``mapInPandas``.

    :param df: input dataframe, repartitioned per ``partition_spec`` first
    :param map_func: function run once per partition, receiving a cursor
      and the partition as a local dataframe
    :param output_schema: schema of ``map_func``'s output
    :param partition_spec: how to repartition before mapping
    :param metadata: metadata attached to the result, defaults to None
    :param on_init: optional callback run once before mapping starts
    :return: the mapped :class:`SparkDataFrame`
    """
    df = self.to_df(self.repartition(df, partition_spec))
    output_schema = Schema(output_schema)
    input_schema = df.schema
    # Wrap on_init so it runs at most once, keyed by the identity of the
    # callback and its first argument.
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(
        dfs: Iterable[pd.DataFrame],
    ) -> Iterable[pd.DataFrame]:  # pragma: no cover
        def get_dfs() -> Iterable[LocalDataFrame]:
            # skip empty chunks; wrap the rest without copying
            # (pandas_df_wrapper=True)
            for df in dfs:
                if df.shape[0] > 0:
                    yield PandasDataFrame(
                        df.reset_index(drop=True),
                        input_schema,
                        pandas_df_wrapper=True,
                    )

        input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
        if input_df.empty:
            # NOTE(review): `return <value>` inside a generator ends
            # iteration; the value itself is discarded, so an empty
            # partition simply yields nothing — confirm this is intended.
            return PandasDataFrame([], output_schema).as_pandas()
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        if isinstance(output_df, LocalDataFrameIterableDataFrame):
            # stream each chunk out instead of materializing the whole result
            for res in output_df.native:
                yield res.as_pandas()
        else:
            yield output_df.as_pandas()

    df = self.to_df(df)
    sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
def df(self, data: Any = None, schema: Any = None, metadata: Any = None) -> SparkDataFrame:
    """Create a :class:`SparkDataFrame` from raw data.

    :param data: raw rows or any object Spark can build a dataframe from;
      None produces a SparkDataFrame wrapping no native dataframe
    :param schema: schema-like object, defaults to None (let Spark infer)
    :param metadata: params-like metadata, defaults to None
    :return: the constructed :class:`SparkDataFrame`
    :raises FugueDataFrameInitError: if Spark fails to infer a schema
      from ``data``
    """
    session = SparkSession.builder.getOrCreate()
    if data is None:
        df = None
    elif schema is not None:
        # coerce values to the requested schema via PandasDataFrame first
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        df = session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        try:
            df = session.createDataFrame(data)
        except Exception as e:
            # chain the original Spark error instead of discarding it,
            # so the real cause stays visible in the traceback
            raise FugueDataFrameInitError("schema error") from e
    return SparkDataFrame(df, schema, metadata)
def _group_map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` per group using Spark's ``applyInPandas``,
    grouping by ``partition_spec.partition_by`` and honoring its presort.

    :param df: input dataframe
    :param map_func: function run once per group, receiving a cursor and
      the group as a local dataframe
    :param output_schema: schema of ``map_func``'s output
    :param partition_spec: defines grouping columns and presort order
    :param metadata: metadata attached to the result, defaults to None
    :param on_init: optional callback run once before mapping starts
    :return: the mapped :class:`SparkDataFrame`
    """
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    # Wrap on_init so it runs at most once, keyed by the identity of the
    # callback and its first argument.
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
        # empty group: return an empty frame with the right columns
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        # apply the requested presort within the group before mapping
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    gdf = df.native.groupBy(*partition_spec.partition_by)
    sdf = gdf.applyInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
def test(expr):
    """Assert that a schema expression survives a Spark schema round trip."""
    spark_schema = to_spark_schema(expr)
    assert to_schema(spark_schema) == expr