Exemplo n.º 1
0
 def __init__(  # noqa: C901
     self,
     df: Any = None,
     schema: Any = None,
     metadata: Any = None,
     pandas_df_wrapper: bool = False,
 ):
     """Initialize the dataframe from ``df`` and an optional ``schema``.

     :param df: ``None``, another ``PandasDataFrame``, a pandas
       ``DataFrame`` or ``Series``, or any other iterable of rows.
       ``None`` is treated as an empty list of rows, defaults to None
     :param schema: schema-like object, defaults to None. It is parsed by
       ``_input_schema`` and checked with ``assert_not_empty`` when ``df``
       is None or a non-pandas iterable; it is optional for pandas input
       and discarded when ``df`` is a ``PandasDataFrame``
     :param metadata: forwarded unchanged to the parent constructor,
       defaults to None
     :param pandas_df_wrapper: when True and ``df`` is a pandas object with
       a schema provided, ``_apply_schema`` is skipped and the pandas
       object is stored as given, defaults to False

     :raises FugueDataFrameInitError: chained from any exception raised
       during initialization, including the ``ValueError`` raised for an
       unsupported ``df``
     """
     try:
         if df is None:
             # No data given: fall through to the iterable branch with an
             # empty row list, after validating the schema up front.
             schema = _input_schema(schema).assert_not_empty()
             df = []
         needs_apply = True
         if isinstance(df, PandasDataFrame):
             # TODO: This is useless if in this way and wrong
             native, schema = df.native, None
         elif isinstance(df, (pd.DataFrame, pd.Series)):
             native = df.to_frame() if isinstance(df, pd.Series) else df
             if schema is not None:
                 schema = _input_schema(schema)
             # In wrapper mode with an explicit schema the pandas object is
             # kept untouched, so the schema application step is skipped.
             needs_apply = not (pandas_df_wrapper and schema is not None)
         elif isinstance(df, Iterable):
             schema = _input_schema(schema).assert_not_empty()
             raw = pd.DataFrame(df, columns=schema.names)
             native = PD_UTILS.enforce_type(
                 raw, schema.pa_schema, null_safe=True
             )
             if PD_UTILS.empty(native):
                 # For an empty frame, set each column's dtype explicitly
                 # from the corresponding schema field.
                 for name, field in schema.items():
                     dtype = field.type.to_pandas_dtype()
                     native[name] = native[name].astype(dtype)
             needs_apply = False
         else:
             raise ValueError(f"{df} is incompatible with PandasDataFrame")
         if needs_apply:
             native, schema = self._apply_schema(native, schema)
         super().__init__(schema, metadata)
         self._native = native
     except Exception as e:
         raise FugueDataFrameInitError from e
Exemplo n.º 2
0
    def to_df(
        self, df: Any, schema: Any = None, metadata: Any = None
    ) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame or list or iterable of arrays
        :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`
          defaults to None.
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
          it is returned as is
        * if the input is any :class:`~fugue.dataframe.dataframe.DataFrame`,
          ``schema`` and ``metadata`` must both be None; the input's own
          schema and metadata are used
        * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a potential type cast may happen to ensure
          the dataframe's schema.
        * all other methods in the engine can take arbitrary dataframes and
          call this method to convert before doing anything
        """
        if isinstance(df, DataFrame):
            msg = "schema and metadata must be None when df is a DataFrame"
            assert_or_throw(
                schema is None and metadata is None,
                ValueError(msg),
            )
            if isinstance(df, SparkDataFrame):
                return df
            # Arrow-backed input is used directly; array/iterable input is
            # first wrapped into an ArrowDataFrame. Both then share the same
            # row-based conversion.
            arrow_df = None
            if isinstance(df, ArrowDataFrame):
                arrow_df = df
            elif isinstance(df, (ArrayDataFrame, IterableDataFrame)):
                arrow_df = ArrowDataFrame(
                    df.as_array(type_safe=False), df.schema
                )
            if arrow_df is not None:
                spark_df = self.spark_session.createDataFrame(
                    arrow_df.as_array(), to_spark_schema(df.schema)
                )
                return SparkDataFrame(spark_df, df.schema, df.metadata)
            # Any other DataFrame: go through rows when a struct column is
            # present, otherwise through pandas.
            has_struct = any(pa.types.is_struct(t) for t in df.schema.types)
            spark_df = self.spark_session.createDataFrame(
                df.as_array(type_safe=True) if has_struct else df.as_pandas(),
                to_spark_schema(df.schema),
            )
            return SparkDataFrame(spark_df, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df,
                to_schema(schema) if schema is not None else None,
                metadata,
            )
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            spark_df = self.spark_session.createDataFrame(
                df, to_spark_schema(schema)
            )
            return SparkDataFrame(spark_df, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            if not PD_UTILS.empty(df):
                spark_df = self.spark_session.createDataFrame(df)
            else:
                # Empty pandas frame: pass an explicit Spark schema obtained
                # via PD_UTILS.to_schema together with an empty row list.
                inferred = to_spark_schema(PD_UTILS.to_schema(df))
                spark_df = self.spark_session.createDataFrame([], inferred)
            return SparkDataFrame(spark_df, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        assert_or_throw(
            schema is not None,
            FugueDataFrameInitError("schema can't be None"),
        )
        arrow_df = ArrowDataFrame(df, to_schema(schema))
        spark_df = self.spark_session.createDataFrame(
            arrow_df.as_array(), to_spark_schema(arrow_df.schema)
        )
        return SparkDataFrame(spark_df, arrow_df.schema, metadata)