示例#1
0
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Wrap or convert ``df`` into a
    :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: a :class:`~fugue.dataframe.dataframe.DataFrame`, a pandas
      DataFrame, a list, or any other iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it must be left unset
      when ``df`` is a :class:`~fugue.dataframe.dataframe.DataFrame`
    :param metadata: dict-like object with string keys, defaults to None, it
      must be left unset when ``df`` is a
      :class:`~fugue.dataframe.dataframe.DataFrame`
    :raises ValueError: if ``df`` is a
      :class:`~fugue.dataframe.dataframe.DataFrame` and ``schema`` or
      ``metadata`` is also provided
    :raises TypeError: if ``df`` is none of the supported input types
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")

    if isinstance(df, DataFrame):
        # An existing DataFrame already carries its own schema and metadata,
        # so overriding either of them here is rejected.
        nothing_overridden = schema is None and metadata is None
        aot(
            nothing_overridden,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()

    # Order matters: a list is also an Iterable, so the more specific type
    # has to be checked before the generic one.
    wrappers = (
        (pd.DataFrame, PandasDataFrame),
        (List, ArrayDataFrame),
        (Iterable, IterableDataFrame),
    )
    for raw_type, wrapper in wrappers:
        if isinstance(df, raw_type):
            return wrapper(df, schema, metadata)

    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
示例#2
0
def test_to_local_bounded_df():
    """Check ``to_local_bounded_df`` on an array and an iterable dataframe.

    An ``ArrayDataFrame`` must come back as the very same object, while an
    ``IterableDataFrame`` must come back as a different object that keeps the
    rows, schema and metadata.
    """
    schema = "a:int,b:int"
    array_df = ArrayDataFrame([[0, 1]], schema)
    iterable_df = IterableDataFrame([[0, 1]], schema, {"a": 1})

    assert to_local_bounded_df(array_df) is array_df

    converted = to_local_bounded_df(iterable_df)
    assert converted is not iterable_df
    assert converted.as_array() == [[0, 1]]
    assert converted.schema == schema
    assert converted.metadata == {"a": 1}
示例#3
0
def test_to_local_df():
    """Exercise ``to_local_df`` on local dataframes, native data and bad input."""
    schema = "a:int,b:int"
    array_df = ArrayDataFrame([[0, 1]], schema)
    pandas_df = PandasDataFrame(array_df.as_pandas(), schema)
    iterable_df = IterableDataFrame([[0, 1]], schema)
    local_dfs = (array_df, pandas_df, iterable_df)

    # Local dataframes are returned as the same object.
    for local in local_dfs:
        assert to_local_df(local) is local

    # Native objects are wrapped in the dataframe class matching their type.
    wrapper_types = (ArrayDataFrame, PandasDataFrame, IterableDataFrame)
    for local, wrapper_type in zip(local_dfs, wrapper_types):
        assert isinstance(to_local_df(local.native, schema), wrapper_type)

    # An unsupported input type is rejected.
    raises(TypeError, lambda: to_local_df(123))

    # Metadata passed along with native data ends up on the result.
    expected_metadata = {"a": 1}
    wrapped = to_local_df(array_df.native, array_df.schema, expected_metadata)
    assert wrapped.metadata == expected_metadata

    # None input, and a schema supplied for an existing dataframe, both fail.
    raises(NoneArgumentError, lambda: to_local_df(None))
    raises(ValueError, lambda: to_local_df(array_df, schema, None))
示例#4
0
def test_serialize_df(tmpdir):
    """Round-trip dataframes through ``serialize_df`` and ``deserialize_df``.

    Covers the plain round trip, the round trip with a file path (with and
    without an explicit file system object), and two failure cases.
    """
    nested_schema = "a:int,b:[int],c:{x:int}"

    def nested_rows():
        # Build a fresh literal on every call so dataframes never share rows.
        return [[None, [1, 2], dict(x=1)]]

    def check_roundtrip(source, expected=None, raw=False):
        # Serialize then deserialize ``source`` and compare with ``expected``
        # (``source`` itself when no expectation is given).
        target = source if expected is None else expected
        restored = deserialize_df(serialize_df(source))
        if not raw:
            df_eq(target, restored, throw=True)
            return
        # raw mode compares the underlying native objects directly
        assert target.native == restored.native

    simple_cases = (
        ([], "a:int,b:int"),
        ([[None, None]], "a:int,b:int"),
        ([[None, "abc"]], "a:int,b:str"),
    )

    fs = FileSystem()
    assert deserialize_df(serialize_df(None)) is None

    for rows, schema in simple_cases:
        check_roundtrip(ArrayDataFrame(rows, schema))
    check_roundtrip(ArrayDataFrame(nested_rows(), nested_schema), raw=True)
    check_roundtrip(
        IterableDataFrame(nested_rows(), nested_schema),
        ArrayDataFrame(nested_rows(), nested_schema),
        raw=True,
    )
    for rows, schema in simple_cases[1:]:
        check_roundtrip(PandasDataFrame(rows, schema))

    # With 0 as the second argument and no file path, serialization must fail.
    raises(
        InvalidOperationError,
        lambda: serialize_df(ArrayDataFrame([], "a:int,b:int"), 0),
    )

    pickle_path = os.path.join(tmpdir, "1.pkl")
    sample = ArrayDataFrame([[None, None]], "a:int,b:int")

    # Same call with a path and an explicit file system object.
    blob = serialize_df(sample, 0, pickle_path, fs)
    df_eq(sample, deserialize_df(blob, fs), throw=True)
    df_eq(sample, deserialize_df(blob), throw=True)

    # Same call with a path but without passing the file system object.
    blob = serialize_df(sample, 0, pickle_path)
    df_eq(sample, deserialize_df(blob), throw=True)

    # This JSON payload is rejected by deserialize_df.
    raises(ValueError, lambda: deserialize_df('{"x":1}'))
示例#5
0
def test_pickle_df():
    """Round-trip dataframes through ``pickle_df`` and ``unpickle_df``."""
    nested_schema = "a:int,b:[int],c:{x:int}"

    def nested_rows():
        # Build a fresh literal on every call so dataframes never share rows.
        return [[None, [1, 2], dict(x=1)]]

    def check_roundtrip(source, expected=None, raw=False):
        # Pickle then unpickle ``source`` and compare with ``expected``
        # (``source`` itself when no expectation is given).
        target = source if expected is None else expected
        restored = unpickle_df(pickle_df(source))
        if not raw:
            df_eq(target, restored, throw=True)
            return
        # raw mode compares the underlying native objects directly
        assert target.native == restored.native

    simple_cases = (
        ([], "a:int,b:int"),
        ([[None, None]], "a:int,b:int"),
        ([[None, "abc"]], "a:int,b:str"),
    )

    for rows, schema in simple_cases:
        check_roundtrip(ArrayDataFrame(rows, schema))
    check_roundtrip(ArrayDataFrame(nested_rows(), nested_schema), raw=True)
    check_roundtrip(
        IterableDataFrame(nested_rows(), nested_schema),
        ArrayDataFrame(nested_rows(), nested_schema),
        raw=True,
    )
    for rows, schema in simple_cases[1:]:
        check_roundtrip(PandasDataFrame(rows, schema))