def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Convert a data structure to
    :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param data: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame or list or iterable of arrays
    :param schema: |SchemaLikeObject| or
      :class:`spark:pyspark.sql.types.StructType` defaults to None.
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it should return itself
    * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
      ``schema`` must be specified
    * When ``schema`` is not None, a potential type cast may happen to
      ensure the dataframe's schema.
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    if isinstance(df, DataFrame):
        # A fugue DataFrame already carries its own schema and metadata,
        # so explicit values would be ambiguous — reject them.
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            # Already engine-compatible; return as-is.
            return df
        if any(pa.types.is_struct(t) for t in df.schema.types):
            # Struct columns do not round-trip through pandas cleanly, so
            # build from a type-safe array representation instead.
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        # Native Spark DataFrame: wrap directly; schema (if given) is only
        # normalized, the data is not re-created.
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        # RDDs carry no schema information, so the caller must supply one.
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        # Let Spark infer types from pandas; the wrapper applies ``schema``
        # (casting if needed) when it is not None.
        sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)
    # use arrow dataframe here to handle nulls in int cols
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
def test_schema_conversion(spark_session):
    """Schema expressions must survive a round trip through Spark types."""

    def check_roundtrip(expr):
        assert to_schema(to_spark_schema(expr)) == expr

    check_roundtrip("a:int,b:long,c:[int],d:datetime,e:date,f:decimal(3,4),g:str")
    check_roundtrip("a:{a:[int],b:[str]}")
    # TODO: "a:[{a:int}]" is not supported by spark, should we support?

    # Converting an already-converted StructType must be a no-op (identity).
    converted = to_spark_schema(to_spark_schema("a:int"))
    assert to_spark_schema(converted) is converted

    # A Spark DataFrame's schema converts back to the fugue expression.
    df = spark_session.createDataFrame([[1]], schema=to_spark_schema("a:int"))
    assert to_schema(to_spark_schema(df)) == "a:int"
    assert to_schema(df) == "a:int"

    # Dict-style schema specs are accepted as well.
    assert to_schema(dict(a=str)) == "a:str"
def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    """Load CSV file(s) into a :class:`SparkDataFrame`.

    :param p: list of file paths to load
    :param columns: None, a list of column names, or a schema-like object
    :param kwargs: reader options; ``infer_schema`` and ``header`` are
      extracted and translated, the rest is passed to the Spark reader
    :raises InvalidOperationError: when no header and no columns are given
    :raises NotImplementedError: for unrecognized ``header`` values
    """
    params = ParamDict(kwargs)
    # Translate fugue's "infer_schema" flag into Spark's "inferSchema" option.
    if params.get("infer_schema", False):
        params["inferSchema"] = True
    if "infer_schema" in params:
        del params["infer_schema"]
    # Normalize the header flag to a lowercase string ("none" when absent).
    header_mode = str(params.get_or_none("header", object)).lower()
    if "header" in params:
        del params["header"]
    reader = self._session.read.format("csv")
    reader.options(**params)
    if header_mode == "true":
        reader.option("header", "true")
        if columns is None:
            return SparkDataFrame(reader.load(p))
        if isinstance(columns, list):  # column names
            return SparkDataFrame(reader.load(p)[columns])
        target = Schema(columns)
        return SparkDataFrame(reader.load(p)[target.names], target)
    if header_mode in ["false", "none"]:
        reader.option("header", "false")
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            # Rename the auto-generated columns positionally.
            raw = reader.load(p)
            inferred = to_schema(raw)
            renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
            return SparkDataFrame(raw.selectExpr(*renames))
        target = Schema(columns)
        raw = reader.schema(to_spark_schema(target)).load(p)
        return SparkDataFrame(raw, target)
    raise NotImplementedError(f"{header_mode} is not supported")
def even_repartition(
    session: SparkSession, df: ps.DataFrame, num: int, cols: List[Any]
) -> ps.DataFrame:
    """Repartition ``df`` into evenly distributed partitions.

    :param session: the Spark session used to rebuild DataFrames from RDDs
    :param df: the input Spark DataFrame
    :param num: target partition count; 1 collapses to a single partition,
      0 with no ``cols`` is a no-op, <=0 with ``cols`` uses the distinct
      key count
    :param cols: partition key columns; empty means partition by row index
    :return: the repartitioned Spark DataFrame
    """
    if num == 1:
        return _single_repartition(df)
    if not cols:
        if num == 0:
            return df
        # Assign each row a sequential index and partition by it so rows
        # spread evenly across exactly ``num`` partitions.
        indexed = (
            _zipWithIndex(df.rdd)
            .partitionBy(num, lambda k: k)
            .mapPartitions(_to_rows)
        )
        return session.createDataFrame(indexed, df.schema)

    # Index the distinct key tuples, then route rows by their key's index.
    distinct_keys = df.select(*cols).distinct()
    key_rdd = _zipWithIndex(distinct_keys.rdd, True)
    key_schema = to_spark_schema(
        to_schema(df.schema).extract(cols) + f"{_PARTITION_DUMMY_KEY}:long"
    )
    idx = session.createDataFrame(key_rdd, key_schema)
    if num <= 0:
        # Persist before counting: idx is reused in the join below.
        idx = idx.persist()
        num = idx.count()
    keyed = (
        df.alias("df")
        .join(idx.alias("idx"), on=cols, how="inner")
        .select(_PARTITION_DUMMY_KEY, "df.*")
    )

    def _as_key_value(rows: Iterable[Any]) -> Iterable[Any]:  # pragma: no cover
        # Split each row into (partition index, remaining columns).
        for row in rows:
            yield (row[0], row[1:])

    shuffled = (
        keyed.rdd.mapPartitions(_as_key_value)
        .partitionBy(num, lambda k: k)
        .mapPartitions(_to_rows)
    )
    return session.createDataFrame(shuffled, df.schema)
def _df(data, schema=None, metadata=None):
    """Build a native Spark DataFrame from raw data.

    When ``schema`` is given the data goes through a ``PandasDataFrame``
    so types are normalized to the fugue schema; otherwise Spark infers
    the schema itself.
    """
    session = SparkSession.builder.getOrCreate()
    if schema is None:
        return session.createDataFrame(data)
    pdf = PandasDataFrame(data, to_schema(schema), metadata)
    return session.createDataFrame(pdf.native, to_spark_schema(schema))
def __init__(  # noqa: C901
    self, df: Any = None, schema: Any = None, metadata: Any = None
):
    """Wrap a native Spark DataFrame as a fugue DataFrame.

    :param df: :class:`spark:pyspark.sql.DataFrame`; other types raise
    :param schema: optional schema-like object; when given, columns are
      cast to match it, otherwise the schema is derived from ``df``
    :param metadata: optional metadata passed to the base class
    :raises FugueDataFrameInitError: any failure during initialization is
      wrapped in this error with the original exception as its cause
    """
    self._lock = RLock()
    try:
        if isinstance(df, ps.DataFrame):
            if schema is not None:
                schema = to_schema(schema).assert_not_empty()
                # Only rebuild the DataFrame when a cast is actually
                # required to match the requested schema.
                has_cast, expr = to_cast_expression(df, schema, True)
                if has_cast:
                    df = df.selectExpr(*expr)
            else:
                # No explicit schema: derive it from the native DataFrame.
                schema = to_schema(df).assert_not_empty()
            self._native = df
            super().__init__(schema, metadata)
        else:  # pragma: no cover
            assert_or_throw(schema is not None, SchemaError("schema is None"))
            # NOTE(review): this conversion result is discarded before the
            # unconditional raise below — presumably it exists so an invalid
            # schema surfaces its own error first; confirm intent.
            schema = to_schema(schema).assert_not_empty()
            raise ValueError(f"{df} is incompatible with SparkDataFrame")
    except Exception as e:  # pragma: no cover
        # Normalize any initialization failure into the fugue error type,
        # preserving the original exception as the cause.
        raise FugueDataFrameInitError from e
def df(
    self, data: Any = None, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Build a :class:`SparkDataFrame` from raw data.

    :param data: raw data (list/iterable/pandas-compatible) or None
    :param schema: optional schema-like object used to normalize types
    :param metadata: optional metadata attached to the result
    :return: the wrapped Spark DataFrame
    :raises FugueDataFrameInitError: when Spark cannot infer a schema
      from ``data``
    """
    session = SparkSession.builder.getOrCreate()
    if data is None:
        df = None
    else:
        if schema is not None:
            # Normalize the data through pandas so types match the schema.
            pdf = PandasDataFrame(data, to_schema(schema), metadata)
            df = session.createDataFrame(pdf.native, to_spark_schema(schema))
        else:
            try:
                df = session.createDataFrame(data)
            except Exception as e:
                # Chain the Spark error so the real cause isn't lost
                # (the original re-raised without `from e`).
                raise FugueDataFrameInitError("schema error") from e
    return SparkDataFrame(df, schema, metadata)
def test(expr):
    """Assert that a schema expression round-trips through Spark types."""
    spark_schema = to_spark_schema(expr)
    assert to_schema(spark_schema) == expr