Example #1
    def to_df(
        self, df: Any, schema: Any = None, metadata: Any = None
    ) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame, or a list or iterable of arrays
        :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`,
          defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * If the input is already a :class:`~fugue_spark.dataframe.SparkDataFrame`,
          it is returned unchanged
        * For :class:`~spark:pyspark.RDD`, list, or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a type cast may happen to enforce
          the target schema
        * All other methods in the engine accept arbitrary dataframes and
          call this method to convert them before doing anything else
        """
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError("schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, SparkDataFrame):
                return df
            if any(pa.types.is_struct(t) for t in df.schema.types):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(type_safe=True), to_spark_schema(df.schema)
                )
            else:
                sdf = self.spark_session.createDataFrame(
                    df.as_pandas(), to_spark_schema(df.schema)
                )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df, None if schema is None else to_schema(schema), metadata
            )
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
            return SparkDataFrame(sdf, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            sdf = self.spark_session.createDataFrame(df)
            return SparkDataFrame(sdf, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        adf = ArrowDataFrame(df, to_schema(schema))
        sdf = self.spark_session.createDataFrame(
            adf.as_array(), to_spark_schema(adf.schema)
        )
        return SparkDataFrame(sdf, adf.schema, metadata)
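A minimal usage sketch of the conversion paths above (not part of the source); it assumes a local SparkSession and that SparkExecutionEngine is importable from fugue_spark:

# Usage sketch; assumes fugue_spark exports SparkExecutionEngine
from pyspark.sql import SparkSession
import pandas as pd
from fugue_spark import SparkExecutionEngine

spark = SparkSession.builder.master("local[*]").getOrCreate()
engine = SparkExecutionEngine(spark)

# pandas DataFrame -> SparkDataFrame (schema inferred by Spark)
sdf1 = engine.to_df(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))

# list of arrays -> SparkDataFrame (schema is required for raw rows)
sdf2 = engine.to_df([[1, "x"], [2, "y"]], schema="a:int,b:str")

# converting an existing SparkDataFrame returns the same object
assert engine.to_df(sdf2) is sdf2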
Example #2
def test_schema_conversion(spark_session):
    def test(expr):
        assert to_schema(to_spark_schema(expr)) == expr

    test("a:int,b:long,c:[int],d:datetime,e:date,f:decimal(3,4),g:str")
    test("a:{a:[int],b:[str]}")
    # test("a:[{a:int}]") TODO: this is not supported by spark, should we support?
    s = to_spark_schema(to_spark_schema("a:int"))
    assert to_spark_schema(s) is s

    df = spark_session.createDataFrame([[1]], schema=to_spark_schema("a:int"))
    assert to_schema(to_spark_schema(df)) == "a:int"
    assert to_schema(df) == "a:int"
    assert to_schema(dict(a=str)) == "a:str"
Example #3
    def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
        kw = ParamDict(kwargs)
        infer_schema = kw.get("infer_schema", False)
        if infer_schema:
            kw["inferSchema"] = True
        if "infer_schema" in kw:
            del kw["infer_schema"]
        header = str(kw.get_or_none("header", object)).lower()
        if "header" in kw:
            del kw["header"]
        reader = self._session.read.format("csv")
        reader.options(**kw)
        if header == "true":
            reader.option("header", "true")
            if columns is None:
                return SparkDataFrame(reader.load(p))
            if isinstance(columns, list):  # column names
                return SparkDataFrame(reader.load(p)[columns])
            schema = Schema(columns)
            return SparkDataFrame(reader.load(p)[schema.names], schema)
        if header in ["false", "none"]:
            reader.option("header", "false")
            if columns is None:
                raise InvalidOperationError("columns must be set if without header")
            if isinstance(columns, list):  # column names
                sdf = reader.load(p)
                inferred = to_schema(sdf)
                renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
                return SparkDataFrame(sdf.selectExpr(*renames))
            schema = Schema(columns)
            sdf = reader.schema(to_spark_schema(schema)).load(p)
            return SparkDataFrame(sdf, schema)
        else:
            raise NotImplementedError(f"{header} is not supported")
Example #4
def even_repartition(
    session: SparkSession, df: ps.DataFrame, num: int, cols: List[Any]
) -> ps.DataFrame:
    if num == 1:
        return _single_repartition(df)
    if len(cols) == 0:
        # no partition keys: assign each row a global index and partition by it
        if num == 0:
            return df
        rdd = (
            _zipWithIndex(df.rdd)
            .partitionBy(num, lambda k: k)
            .mapPartitions(_to_rows)
        )
        return session.createDataFrame(rdd, df.schema)
    else:
        # with partition keys: assign each distinct key an index,
        # join it back, then partition by that index
        keys = df.select(*cols).distinct()
        krdd = _zipWithIndex(keys.rdd, True)
        new_schema = to_spark_schema(
            to_schema(df.schema).extract(cols) + f"{_PARTITION_DUMMY_KEY}:long"
        )
        idx = session.createDataFrame(krdd, new_schema)
        if num <= 0:
            # num <= 0 means one partition per distinct key
            idx = idx.persist()
            num = idx.count()
        idf = (
            df.alias("df")
            .join(idx.alias("idx"), on=cols, how="inner")
            .select(_PARTITION_DUMMY_KEY, "df.*")
        )

        def _to_kv(rows: Iterable[Any]) -> Iterable[Any]:  # pragma: no cover
            for row in rows:
                yield (row[0], row[1:])

        rdd = (
            idf.rdd.mapPartitions(_to_kv)
            .partitionBy(num, lambda k: k)
            .mapPartitions(_to_rows)
        )
        return session.createDataFrame(rdd, df.schema)
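A hedged usage sketch for the helper above; even_repartition is an internal utility (its _zipWithIndex, _single_repartition and _to_rows helpers are not shown), so the import path below is an assumption:

# Usage sketch; the import path is an assumption
from pyspark.sql import SparkSession
from fugue_spark._utils.partition import even_repartition  # assumed path

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(i, i % 3) for i in range(9)], "v LONG, k LONG")

# spread rows evenly across 4 partitions, ignoring keys
evenly = even_repartition(spark, sdf, 4, [])

# one partition per distinct value of k (num <= 0 means "count the keys")
by_key = even_repartition(spark, sdf, 0, ["k"])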
Example #5
def _df(data, schema=None, metadata=None):
    session = SparkSession.builder.getOrCreate()
    if schema is not None:
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        return session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        return session.createDataFrame(data)
Example #6
    def __init__(  # noqa: C901
        self, df: Any = None, schema: Any = None, metadata: Any = None
    ):
        self._lock = RLock()
        try:
            if isinstance(df, ps.DataFrame):
                if schema is not None:
                    schema = to_schema(schema).assert_not_empty()
                    has_cast, expr = to_cast_expression(df, schema, True)
                    if has_cast:
                        df = df.selectExpr(*expr)
                else:
                    schema = to_schema(df).assert_not_empty()
                self._native = df
                super().__init__(schema, metadata)
            else:  # pragma: no cover
                assert_or_throw(schema is not None, SchemaError("schema is None"))
                schema = to_schema(schema).assert_not_empty()
                raise ValueError(f"{df} is incompatible with SparkDataFrame")
        except Exception as e:  # pragma: no cover
            raise FugueDataFrameInitError from e
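A small construction sketch (not from the source), assuming SparkDataFrame is exported by fugue_spark and that schemas compare equal to their string expressions, as in the test in Example #2; passing an explicit schema triggers the cast-expression path:

# Construction sketch; assumes fugue_spark exports SparkDataFrame
from pyspark.sql import SparkSession
from fugue_spark import SparkDataFrame

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([(1, "3.5"), (2, "4.0")], "a LONG, b STRING")

# schema inferred from the native Spark DataFrame
fdf1 = SparkDataFrame(raw)
assert fdf1.schema == "a:long,b:str"

# explicit schema: column b is cast from string to double via selectExpr
fdf2 = SparkDataFrame(raw, "a:long,b:double")
assert fdf2.schema == "a:long,b:double"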
Example #7
    def df(
        self, data: Any = None, schema: Any = None, metadata: Any = None
    ) -> SparkDataFrame:
        session = SparkSession.builder.getOrCreate()
        if data is None:
            df = None
        else:
            if schema is not None:
                pdf = PandasDataFrame(data, to_schema(schema), metadata)
                df = session.createDataFrame(pdf.native, to_spark_schema(schema))
            else:
                try:
                    df = session.createDataFrame(data)
                except Exception:
                    raise FugueDataFrameInitError("schema error")
        return SparkDataFrame(df, schema, metadata)