def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    kw = ParamDict(kwargs)
    infer_schema = kw.get("infer_schema", False)
    if infer_schema:
        kw["inferSchema"] = True
    if "infer_schema" in kw:
        del kw["infer_schema"]
    header = str(kw.get_or_none("header", object)).lower()
    if "header" in kw:
        del kw["header"]
    reader = self._session.read.format("csv")
    reader.options(**kw)
    if header == "true":
        reader.option("header", "true")
        if columns is None:
            return SparkDataFrame(reader.load(p))
        if isinstance(columns, list):  # column names
            return SparkDataFrame(reader.load(p)[columns])
        schema = Schema(columns)
        return SparkDataFrame(reader.load(p)[schema.names], schema)
    if header in ["false", "none"]:
        reader.option("header", "false")
        if columns is None:
            raise InvalidOperationError("columns must be set when there is no header")
        if isinstance(columns, list):  # column names
            sdf = reader.load(p)
            inferred = to_schema(sdf)
            renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
            return SparkDataFrame(sdf.selectExpr(*renames))
        schema = Schema(columns)
        sdf = reader.schema(to_spark_schema(schema)).load(p)
        return SparkDataFrame(sdf, schema)
    else:
        raise NotImplementedError(f"{header} is not supported")
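
# A minimal sketch (not part of the engine) of the PySpark reader calls that
# _load_csv above forwards to. The path "/tmp/data.csv" and the DDL schema
# are hypothetical; a local SparkSession is assumed.
from pyspark.sql import SparkSession


def _csv_read_sketch():
    spark = SparkSession.builder.getOrCreate()
    # with a header row: let Spark pick up column names and optionally infer types
    with_header = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load("/tmp/data.csv")
    )
    # without a header row: supply an explicit schema, as the explicit-schema
    # branch above does via to_spark_schema(Schema(columns))
    no_header = (
        spark.read.format("csv")
        .option("header", "false")
        .schema("a STRING, b INT")
        .load("/tmp/data.csv")
    )
    return with_header, no_header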
def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame, or a list or iterable of arrays
    :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`,
      defaults to None
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already a :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it is returned as is
    * for :class:`~spark:pyspark.RDD`, lists, and iterables of arrays,
      ``schema`` must be specified
    * when ``schema`` is not None, a type cast may happen to ensure the
      dataframe conforms to the schema
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert them before doing anything
    """
    if isinstance(df, DataFrame):
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            return df
        if any(pa.types.is_struct(t) for t in df.schema.types):
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)
    # use an arrow dataframe here to handle nulls in int columns
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
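
# A minimal sketch (toy data, local SparkSession assumed) of the
# SparkSession.createDataFrame calls that to_df above dispatches to for each
# input type; illustrative only, not part of the engine.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType, StructField, StructType


def _to_df_sketch():
    spark = SparkSession.builder.getOrCreate()
    spark_schema = StructType(
        [StructField("a", StringType()), StructField("b", LongType())]
    )
    # pandas input: the schema is taken from the pandas DataFrame itself
    from_pandas = spark.createDataFrame(pd.DataFrame([["x", 1]], columns=["a", "b"]))
    # list-of-arrays input: an explicit schema is required, mirroring the
    # "schema must be specified" notice in the docstring
    from_rows = spark.createDataFrame([["x", 1]], spark_schema)
    # RDD input: the same explicit-schema requirement applies
    from_rdd = spark.createDataFrame(
        spark.sparkContext.parallelize([["x", 1]]), spark_schema
    )
    return from_pandas, from_rows, from_rdd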
def _load_json(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    reader = self._session.read.format("json")
    reader.options(**kwargs)
    if columns is None:
        return SparkDataFrame(reader.load(p))
    if isinstance(columns, list):  # column names
        return SparkDataFrame(reader.load(p)[columns])
    schema = Schema(columns)
    return SparkDataFrame(reader.load(p)[schema.names], schema)
def _load_parquet(
    self, p: List[str], columns: Any = None, **kwargs: Any
) -> DataFrame:
    sdf = self._session.read.parquet(*p, **kwargs)
    if columns is None:
        return SparkDataFrame(sdf)
    if isinstance(columns, list):  # column names
        return SparkDataFrame(sdf[columns])
    schema = Schema(columns)
    return SparkDataFrame(sdf[schema.names], schema)
def _load_avro(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    # avro is an external data source that has built-in support since Spark 2.4
    reader = self._session.read.format("avro")
    reader.options(**kwargs)
    if columns is None:
        return SparkDataFrame(reader.load(p))
    if isinstance(columns, list):  # column names
        return SparkDataFrame(reader.load(p)[columns])
    schema = Schema(columns)
    return SparkDataFrame(reader.load(p)[schema.names], schema)
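
# A minimal sketch showing how the "avro" format used by _load_avro above is
# typically enabled: the data source lives in an external module, so the
# spark-avro package must be on the classpath. The package coordinates and
# the file path below are assumptions (Spark 3.x / Scala 2.12).
from pyspark.sql import SparkSession


def _avro_read_sketch():
    spark = (
        SparkSession.builder.config(
            "spark.jars.packages",
            "org.apache.spark:spark-avro_2.12:3.1.2",  # assumed version
        ).getOrCreate()
    )
    return spark.read.format("avro").load("/tmp/data.avro")  # hypothetical path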
def _df(data, schema=None, metadata=None):
    session = SparkSession.builder.getOrCreate()
    if schema is not None:
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        df = session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        df = session.createDataFrame(data)
    return SparkDataFrame(df, schema, metadata)
def _persist(self, df: SparkDataFrame, lazy: bool, level: Any) -> SparkDataFrame:
    if level is None:
        level = StorageLevel.MEMORY_AND_DISK
    if isinstance(level, str) and level in StorageLevel.__dict__:
        level = StorageLevel.__dict__[level]
    if isinstance(level, StorageLevel):
        df.native.persist(level)
        if not lazy:
            ct = df.count()
            self.log.info("Persist dataframe with %s, count %i", level, ct)
        return df
    raise ValueError(f"{level} is not a supported persist type")  # pragma: no cover
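
# A minimal sketch (illustrative only) of the StorageLevel resolution used by
# _persist above: a level given by name is looked up on pyspark.StorageLevel,
# passed to DataFrame.persist, and a count() forces eager materialization.
# The input DataFrame `sdf` is assumed to exist.
from pyspark import StorageLevel


def _persist_sketch(sdf, level_name: str = "MEMORY_AND_DISK", lazy: bool = False):
    level = StorageLevel.__dict__.get(level_name, StorageLevel.MEMORY_AND_DISK)
    sdf = sdf.persist(level)
    if not lazy:
        sdf.count()  # trigger a job so the cache is populated eagerly
    return sdf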
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = SparkDataFrame(data, s)
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
def _map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    df = self.to_df(self.repartition(df, partition_spec))
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(
        dfs: Iterable[pd.DataFrame],
    ) -> Iterable[pd.DataFrame]:  # pragma: no cover
        def get_dfs() -> Iterable[LocalDataFrame]:
            for df in dfs:
                if df.shape[0] > 0:
                    yield PandasDataFrame(
                        df.reset_index(drop=True),
                        input_schema,
                        pandas_df_wrapper=True,
                    )

        input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
        if input_df.empty:
            # yield (not return) so the generator emits an empty frame
            # with the correct output columns
            yield PandasDataFrame([], output_schema).as_pandas()
            return
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        if isinstance(output_df, LocalDataFrameIterableDataFrame):
            for res in output_df.native:
                yield res.as_pandas()
        else:
            yield output_df.as_pandas()

    df = self.to_df(df)
    sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
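
# A minimal, self-contained sketch (not the engine code) of the
# DataFrame.mapInPandas pattern that _map_by_pandas_udf above builds on:
# the UDF receives an iterator of pandas DataFrames for one Spark partition
# and yields pandas DataFrames matching the declared output schema.
from typing import Iterator

import pandas as pd
from pyspark.sql import SparkSession


def _map_in_pandas_sketch():
    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1, 2.0), (3, 4.0)], "a long, b double")

    def add_one(batches: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
        for pdf in batches:
            pdf["b"] = pdf["b"] + 1.0
            yield pdf

    return sdf.mapInPandas(add_one, schema="a long, b double")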
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:
    for k, v in dfs.items():
        self.execution_engine.register(v, k)  # type: ignore
    con = ibis.pyspark.connect(self.execution_engine.spark_session)  # type: ignore
    expr = ibis_func(con)
    schema = to_schema(expr.schema())
    result = expr.compile()
    assert_or_throw(
        isinstance(result, PySparkDataFrame),
        lambda: ValueError(f"result must be a PySpark DataFrame ({type(result)})"),
    )
    return SparkDataFrame(result, schema=schema)
def df(
    self, data: Any = None, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    session = SparkSession.builder.getOrCreate()
    if data is None:
        df = None
    else:
        if schema is not None:
            pdf = PandasDataFrame(data, to_schema(schema), metadata)
            df = session.createDataFrame(pdf.native, to_spark_schema(schema))
        else:
            try:
                df = session.createDataFrame(data)
            except Exception:
                raise FugueDataFrameInitError("schema error")
    return SparkDataFrame(df, schema, metadata)
def save_df(
    self,
    df: SparkDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    mode: str = "overwrite",
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    if not force_single:
        p = FileParser(uri, format_hint)
        writer = self._get_writer(df.native, partition_spec)
        writer.format(p.file_format).options(**kwargs).mode(mode)
        writer.save(uri)
    else:
        ldf = df.as_local()
        save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
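
# A minimal sketch (hypothetical output path and column name) of the
# DataFrameWriter chain that save_df above configures: format, per-call
# options, save mode, then save. The partitionBy call stands in for the writer
# returned by _get_writer when the partition spec has keys; it assumes sdf
# contains a column named "a".
def _save_sketch(sdf, uri: str = "/tmp/out_parquet"):
    (
        sdf.write.format("parquet")
        .mode("overwrite")
        .partitionBy("a")  # assumption: the partition spec keys map to column "a"
        .save(uri)
    )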
def test_init(spark_session):
    sdf = spark_session.createDataFrame([["a", 1]])
    df = SparkDataFrame(sdf, "a:str,b:double")
    assert [["a", 1.0]] == df.as_array()
    assert [["a", 1.0]] == df.as_pandas().values.tolist()
    assert not df.is_local
    assert df.is_bounded
    assert df.num_partitions > 0

    df = _df([["a", 1], ["b", 2]])
    assert [["a", 1], ["b", 2]] == df.as_array()

    df = _df([], "a:str,b:str")
    assert [] == df.as_array()
    assert df.schema == "a:str,b:str"

    df = _df([["a", 1], ["b", 2]], "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_array()
    assert df.schema == "a:str,b:str"
def _group_map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    gdf = df.native.groupBy(*partition_spec.partition_by)
    sdf = gdf.applyInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
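
# A minimal, self-contained sketch (not the engine code) of the
# groupBy(...).applyInPandas pattern that _group_map_by_pandas_udf above uses:
# each group arrives as one pandas DataFrame, can be presorted inside the UDF,
# and the result must match the declared output schema.
import pandas as pd
from pyspark.sql import SparkSession


def _apply_in_pandas_sketch():
    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([("x", 3), ("x", 1), ("y", 2)], "k string, v long")

    def take_min(pdf: pd.DataFrame) -> pd.DataFrame:
        pdf = pdf.sort_values("v", ascending=True)  # presort within the group
        return pdf.head(1)

    return sdf.groupBy("k").applyInPandas(take_min, schema="k string, v long")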
def test_nested(spark_session):
    # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    # df = SparkDataFrame(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    data = [[[10, 20]]]
    sdf = spark_session.createDataFrame(data, to_spark_schema("a:[int]"))
    df = SparkDataFrame(sdf)
    assert data == df.as_array(type_safe=False)
    assert data == df.as_array(type_safe=True)
    assert data == list(df.as_array_iterable(type_safe=False))
    assert data == list(df.as_array_iterable(type_safe=True))

    data = [[dict(b=[30, 40])]]
    sdf = spark_session.createDataFrame(data, to_spark_schema("a:{a:str,b:[int]}"))
    df = SparkDataFrame(sdf)
    a = df.as_array(type_safe=False)
    assert [[dict(a=None, b=[30, 40])]] == a
    a = df.as_array(type_safe=True)
    assert [[dict(a=None, b=[30, 40])]] == a
    a = list(df.as_array_iterable(type_safe=False))
    assert [[dict(a=None, b=[30, 40])]] == a
    a = list(df.as_array_iterable(type_safe=True))
    assert [[dict(a=None, b=[30, 40])]] == a
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    for k, v in dfs.items():
        self.execution_engine.register(v, k)  # type: ignore
    return SparkDataFrame(
        self.execution_engine.spark_session.sql(statement)  # type: ignore
    )
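
# A tiny sketch (illustrative only) of the register-then-query pattern behind
# the SQL select above: each input DataFrame is exposed as a temp view and the
# statement runs through SparkSession.sql. The view name and data are made up.
from pyspark.sql import SparkSession


def _sql_select_sketch():
    spark = SparkSession.builder.getOrCreate()
    spark.createDataFrame([(1, "x")], "a long, b string").createOrReplaceTempView("t")
    return spark.sql("SELECT a, b FROM t WHERE a > 0")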
def _broadcast(self, df: SparkDataFrame) -> SparkDataFrame:
    sdf = broadcast(df.native)
    return SparkDataFrame(sdf, df.schema, df.metadata)