Example #1
def test_nested(spark_session):
    # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    # df = SparkDataFrame(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    data = [[[10, 20]]]
    sdf = spark_session.createDataFrame(data, to_spark_schema("a:[int]"))
    df = SparkDataFrame(sdf)
    assert data == df.as_array(type_safe=False)
    assert data == df.as_array(type_safe=True)
    assert data == list(df.as_array_iterable(type_safe=False))
    assert data == list(df.as_array_iterable(type_safe=True))

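    # struct column with a missing field: `a` is absent from the input data and
    # should come back as None in both the type-safe and non-type-safe paths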
    data = [[dict(b=[30, 40])]]
    sdf = spark_session.createDataFrame(data,
                                        to_spark_schema("a:{a:str,b:[int]}"))
    df = SparkDataFrame(sdf)
    a = df.as_array(type_safe=False)
    assert [[dict(a=None, b=[30, 40])]] == a
    a = df.as_array(type_safe=True)
    assert [[dict(a=None, b=[30, 40])]] == a
    a = list(df.as_array_iterable(type_safe=False))
    assert [[dict(a=None, b=[30, 40])]] == a
    a = list(df.as_array_iterable(type_safe=True))
    assert [[dict(a=None, b=[30, 40])]] == a
Example #2
    def to_df(
        self, df: Any, schema: Any = None, metadata: Any = None
    ) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame, or a list or iterable of arrays
        :param schema: |SchemaLikeObject| or
          :class:`spark:pyspark.sql.types.StructType`, defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * If the input is already a :class:`~fugue_spark.dataframe.SparkDataFrame`,
          it is returned as is
        * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a type cast may be applied so that the
          output dataframe conforms to that schema
        * All other engine methods can take arbitrary dataframes and call this
          method to convert them before doing anything else
        """
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError("schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, SparkDataFrame):
                return df
            if any(pa.types.is_struct(t) for t in df.schema.types):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(type_safe=True), to_spark_schema(df.schema)
                )
            else:
                sdf = self.spark_session.createDataFrame(
                    df.as_pandas(), to_spark_schema(df.schema)
                )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df, None if schema is None else to_schema(schema), metadata
            )
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
            return SparkDataFrame(sdf, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            sdf = self.spark_session.createDataFrame(df)
            return SparkDataFrame(sdf, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        adf = ArrowDataFrame(df, to_schema(schema))
        sdf = self.spark_session.createDataFrame(
            adf.as_array(), to_spark_schema(adf.schema)
        )
        return SparkDataFrame(sdf, adf.schema, metadata)
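
A minimal usage sketch of the to_df contract described in the docstring above; the
construction SparkExecutionEngine(spark_session) and the sample data are illustrative
assumptions, not taken from the source:

import pandas as pd
from fugue_spark import SparkExecutionEngine  # assumed import path

engine = SparkExecutionEngine(spark_session)

# a list of arrays (or an RDD) requires an explicit schema
df1 = engine.to_df([[0, "a"], [1, "b"]], "x:int,y:str")

# a pandas DataFrame can be converted without a schema
df2 = engine.to_df(pd.DataFrame({"x": [0, 1], "y": ["a", "b"]}))

# an existing SparkDataFrame is returned as is
assert engine.to_df(df1) is df1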
Example #3
def test_schema_conversion(spark_session):
    def test(expr):
        assert to_schema(to_spark_schema(expr)) == expr

    test("a:int,b:long,c:[int],d:datetime,e:date,f:decimal(3,4),g:str")
    test("a:{a:[int],b:[str]}")
    # test("a:[{a:int}]") TODO: this is not supported by spark, should we support?
    s = to_spark_schema(to_spark_schema("a:int"))
    assert to_spark_schema(s) is s

    df = spark_session.createDataFrame([[1]], schema=to_spark_schema("a:int"))
    assert to_schema(to_spark_schema(df)) == "a:int"
    assert to_schema(df) == "a:int"
    assert to_schema(dict(a=str)) == "a:str"
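
A hedged illustration of what the round trip above implies; the exact StructType layout
is an assumption based on the usual mapping (int -> IntegerType, [int] -> ArrayType(IntegerType)):

spark_schema = to_spark_schema("a:int,b:[int]")  # a pyspark.sql.types.StructType
# likely equivalent to StructType([StructField("a", IntegerType(), True),
#                                  StructField("b", ArrayType(IntegerType()), True)])
assert to_schema(spark_schema) == "a:int,b:[int]"  # round trip, as test() verifies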
Example #4
 def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
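     # Normalize fugue-style options (infer_schema, header) into Spark CSV reader
     # options; `columns` may be None, a list of column names, or a schema-like object.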
     kw = ParamDict(kwargs)
     infer_schema = kw.get("infer_schema", False)
     if infer_schema:
         kw["inferSchema"] = True
     if "infer_schema" in kw:
         del kw["infer_schema"]
     header = str(kw.get_or_none("header", object)).lower()
     if "header" in kw:
         del kw["header"]
     reader = self._session.read.format("csv")
     reader.options(**kw)
     if header == "true":
         reader.option("header", "true")
         if columns is None:
             return SparkDataFrame(reader.load(p))
         if isinstance(columns, list):  # column names
             return SparkDataFrame(reader.load(p)[columns])
         schema = Schema(columns)
         return SparkDataFrame(reader.load(p)[schema.names], schema)
     if header in ["false", "none"]:
         reader.option("header", "false")
         if columns is None:
             raise InvalidOperationError("columns must be set when there is no header")
         if isinstance(columns, list):  # column names
             sdf = reader.load(p)
             inferred = to_schema(sdf)
             renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
             return SparkDataFrame(sdf.selectExpr(*renames))
         schema = Schema(columns)
         sdf = reader.schema(to_spark_schema(schema)).load(p)
         return SparkDataFrame(sdf, schema)
     else:
         raise NotImplementedError(f"{header} is not supported")
Example #5
def even_repartition(session: SparkSession, df: ps.DataFrame, num: int,
                     cols: List[Any]) -> ps.DataFrame:
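    # Distribute rows evenly: with no `cols` and num > 1, rows are round-robined into
    # `num` partitions by their global index; with `cols`, distinct key combinations
    # are indexed and spread across partitions (num <= 0 means one partition per key).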
    if num == 1:
        return _single_repartition(df)
    if len(cols) == 0:
        if num == 0:
            return df
        rdd = (_zipWithIndex(df.rdd).partitionBy(
            num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
    else:
        keys = df.select(*cols).distinct()
        krdd = _zipWithIndex(keys.rdd, True)
        new_schema = to_spark_schema(
            to_schema(df.schema).extract(cols) +
            f"{_PARTITION_DUMMY_KEY}:long")
        idx = session.createDataFrame(krdd, new_schema)
        if num <= 0:
            idx = idx.persist()
            num = idx.count()
        idf = (df.alias("df").join(idx.alias("idx"), on=cols,
                                   how="inner").select(_PARTITION_DUMMY_KEY,
                                                       "df.*"))

        def _to_kv(rows: Iterable[Any]) -> Iterable[Any]:  # pragma: no cover
            for row in rows:
                yield (row[0], row[1:])

        rdd = (idf.rdd.mapPartitions(_to_kv).partitionBy(
            num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
Example #6
def _df(data, schema=None, metadata=None):
    session = SparkSession.builder.getOrCreate()
    if schema is not None:
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        return session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        return session.createDataFrame(data)
Example #7
    def _map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
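        # Implemented with Spark's mapInPandas: each physical partition arrives as a
        # stream of pandas chunks, which is wrapped as a LocalDataFrameIterableDataFrame
        # and passed to map_func together with a partition cursor.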
        df = self.to_df(self.repartition(df, partition_spec))
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(
            dfs: Iterable[pd.DataFrame],
        ) -> Iterable[pd.DataFrame]:  # pragma: no cover
            def get_dfs() -> Iterable[LocalDataFrame]:
                for df in dfs:
                    if df.shape[0] > 0:
                        yield PandasDataFrame(
                            df.reset_index(drop=True),
                            input_schema,
                            pandas_df_wrapper=True,
                        )

            input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
            if input_df.empty:
                # _udf is a generator, so an empty result must be yielded rather
                # than returned (a value returned here would be discarded)
                yield PandasDataFrame([], output_schema).as_pandas()
                return
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            if isinstance(output_df, LocalDataFrameIterableDataFrame):
                for res in output_df.native:
                    yield res.as_pandas()
            else:
                yield output_df.as_pandas()

        df = self.to_df(df)
        sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
Example #8
 def df(self,
        data: Any = None,
        schema: Any = None,
        metadata: Any = None) -> SparkDataFrame:
     session = SparkSession.builder.getOrCreate()
     if data is None:
         df = None
     else:
         if schema is not None:
             pdf = PandasDataFrame(data, to_schema(schema), metadata)
             df = session.createDataFrame(pdf.native,
                                          to_spark_schema(schema))
         else:
             try:
                 df = session.createDataFrame(data)
             except Exception:
                 raise FugueDataFrameInitError("schema error")
     return SparkDataFrame(df, schema, metadata)
Example #9
    def _group_map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
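        # Implemented with Spark's applyInPandas: rows are grouped by the partition
        # keys, optionally presorted, and each group is passed to map_func as a
        # pandas-backed LocalDataFrame.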
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)

        gdf = df.native.groupBy(*partition_spec.partition_by)
        sdf = gdf.applyInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
Example #10
 def test(expr):
     assert to_schema(to_spark_schema(expr)) == expr