Exemplo n.º 1
0
 def get_dfs(seq):
     """Yield one dataframe for every recognized code found in ``seq``.

     * ``"e"`` -> an empty ``IterableDataFrame`` with schema ``a:int,b:int``
     * ``"v"`` -> a one-row ``IterableDataFrame`` (``[1, 10]``), same schema
     * ``"o"`` -> an empty ``ArrayDataFrame`` with schema ``a:int,b:str``

     Items that match none of the codes produce nothing.
     """
     for code in seq:
         if code == "e":
             yield IterableDataFrame([], "a:int,b:int")
         elif code == "v":
             yield IterableDataFrame([[1, 10]], "a:int,b:int")
         elif code == "o":
             # bad schema but empty dataframe doesn't matter
             yield ArrayDataFrame([], "a:int,b:str")
Exemplo n.º 2
0
 def as_array_iterable(
     self, columns: Optional[List[str]] = None, type_safe: bool = False
 ) -> Iterable[Any]:
     """Lazily yield the rows of this dataframe one at a time.

     :param columns: passed to ``self._withColumns`` to choose the columns
         of the dataframe that is iterated
     :param type_safe: when false, rows come from the native RDD's local
         iterator after going through ``to_type_safe_input``; when true,
         that same row stream is wrapped in an ``IterableDataFrame`` and
         re-read with ``type_safe=True``
     :return: a generator over the rows
     """
     selected = self._withColumns(columns)
     if type_safe:
         # Reuse the non-type-safe stream and let IterableDataFrame do the
         # type-safe pass over it.
         wrapped = IterableDataFrame(
             selected.as_array_iterable(type_safe=False), selected.schema
         )
         yield from wrapped.as_array_iterable(type_safe=True)
     else:
         yield from to_type_safe_input(
             selected.native.rdd.toLocalIterator(), selected.schema
         )
Exemplo n.º 3
0
    def to_output_df(self, output: EmptyAwareIterable[Dict[str, Any]],
                     schema: Any) -> DataFrame:
        """Turn an iterable of dict rows into an ``IterableDataFrame``.

        :param output: rows as dictionaries, indexed by the schema's names
        :param schema: a ``Schema`` instance, or anything ``Schema(...)``
            accepts
        :return: an ``IterableDataFrame`` whose rows are lists ordered by
            ``schema.names``
        """
        if not isinstance(schema, Schema):
            schema = Schema(schema)

        def rows_in_schema_order() -> Iterable[List[Any]]:
            # Dict rows are unordered w.r.t. the schema; emit the values in
            # the schema's column order instead.
            for record in output:
                yield [record[name] for name in schema.names]

        return IterableDataFrame(rows_in_schema_order(), schema)
Exemplo n.º 4
0
def _test_as_array_perf():
    """Manual micro-benchmark of ``IterableDataFrame.as_array``.

    Builds 5000 identical rows of 300 columns (100 ``int`` columns fed with
    ints, 100 ``int`` columns fed with floats, 100 ``str`` columns) and
    prints the accumulated wall time, in seconds, of ten ``as_array()``
    calls and ten ``as_array(type_safe=True)`` calls.

    An ``IterableDataFrame`` can only be iterated once, so a fresh one is
    created for every measured call; reusing a single instance would time
    conversions of an already-exhausted (empty) dataframe.
    """
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = [list(arr) for _ in range(5000)]

    def make_df():
        # Construction is kept out of the timed sections below.
        return IterableDataFrame(data, s)

    # Warm-up run of both code paths before measuring.
    make_df().as_array()
    make_df().as_array(type_safe=True)

    nts, ts = 0.0, 0.0
    for _ in range(10):
        df = make_df()
        t = datetime.now()
        df.as_array()
        nts += (datetime.now() - t).total_seconds()

        df = make_df()
        t = datetime.now()
        df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
Exemplo n.º 5
0
 def run(self, no: int, rows: Iterable[ps.Row]) -> Iterable[Any]:
     """Apply ``self.map_func`` to the incoming rows and yield result rows.

     The rows are passed through ``to_type_safe_input`` and wrapped in an
     ``IterableDataFrame``. If that dataframe is empty nothing is yielded.
     Otherwise ``self.on_init`` (when set) is called once with ``no`` and
     the dataframe, the data is split according to ``self.partition_spec``
     (a single ``(0, 0, ...)`` piece when the spec is empty), and
     ``self.map_func`` is called per piece with the shared cursor.

     :param no: number forwarded to ``get_cursor`` and ``on_init``
     :param rows: the input rows
     :return: a generator over the type-safe rows of every map result
     """
     df = IterableDataFrame(
         to_type_safe_input(rows, self.schema), self.schema, self.metadata
     )
     if df.empty:  # pragma: no cover
         return
     cursor = self.partition_spec.get_cursor(self.schema, no)
     if self.on_init is not None:
         self.on_init(no, df)

     partitions: Iterable[Tuple[int, int, EmptyAwareIterable]]
     if not self.partition_spec.empty:
         partitions = self.partition_spec.get_partitioner(
             self.schema
         ).partition(df.native)
     else:
         partitions = [(0, 0, df.native)]

     for p_no, s_no, chunk in partitions:
         # Position the cursor on the chunk's first row before mapping.
         cursor.set(chunk.peek(), p_no, s_no)
         chunk_df = IterableDataFrame(chunk, self.schema)
         chunk_df._metadata = self.metadata
         mapped = self.map_func(cursor, chunk_df)
         yield from mapped.as_array_iterable(type_safe=True)
Exemplo n.º 6
0
def test_nested():
    """Type-safe ``as_array`` on nested struct and list-of-struct columns."""
    # Struct column: one row gives a dict, the other gives a JSON string.
    rows = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    result = IterableDataFrame(rows, "a:{a:str,b:[int]}").as_array(
        type_safe=True
    )
    assert result == [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]]

    # List-of-struct column whose single element is a JSON string.
    rows = [[[json.dumps(dict(b=[30, "40"]))]]]
    result = IterableDataFrame(rows, "a:[{a:str,b:[int]}]").as_array(
        type_safe=True
    )
    assert result == [[[dict(a=None, b=[30, 40])]]]
Exemplo n.º 7
0
def f21(e: List[Dict[str, Any]], a: Iterable[Dict[str, Any]]) -> DataFrame:
    """Append the records of ``a`` to ``e`` and return their ``"a"`` values.

    ``e`` is extended in place. The result is an ``IterableDataFrame`` with
    schema ``a:int`` holding one single-value row per record of ``e``.
    """
    e.extend(list(a))
    return IterableDataFrame([[record["a"]] for record in e], "a:int")
Exemplo n.º 8
0
def f20(e: List[List[Any]], a: Iterable[List[Any]]) -> LocalDataFrame:
    """Append the rows of ``a`` to ``e`` and wrap ``e`` as a dataframe.

    ``e`` is extended in place and then handed to ``IterableDataFrame``
    with schema ``a:int``.
    """
    e.extend(list(a))
    combined = IterableDataFrame(e, "a:int")
    return combined
Exemplo n.º 9
0
 def to_output_df(self, output: EmptyAwareIterable[List[Any]],
                  schema: Any) -> DataFrame:
     """Wrap an iterable of list rows in an ``IterableDataFrame``.

     :param output: the rows, each one a list of values
     :param schema: forwarded unchanged to ``IterableDataFrame``
     :return: the new ``IterableDataFrame``
     """
     wrapped = IterableDataFrame(output, schema)
     return wrapped
Exemplo n.º 10
0
 def to_output_df(self, output: Iterable[List[Any]], schema: Any,
                  ctx: Any) -> DataFrame:
     """Wrap an iterable of list rows in an ``IterableDataFrame``.

     :param output: the rows, each one a list of values
     :param schema: forwarded unchanged to ``IterableDataFrame``
     :param ctx: accepted but not used by this implementation
     :return: the new ``IterableDataFrame``
     """
     wrapped = IterableDataFrame(output, schema)
     return wrapped
Exemplo n.º 11
0
def test_simple_methods():
    """Exercise count/empty/peek/as_array/as_pandas on IterableDataFrame."""
    schema = "x:str,y:double"

    # The checks below run in a fixed order on purpose: peeking must not
    # consume rows, and as_array() comes last because it drains the data.
    df = IterableDataFrame([["a", 1], ["b", "2"]], schema)
    raises(InvalidOperationError, lambda: df.count())
    assert not df.empty
    assert df.peek_array() == ["a", 1.0]
    assert df.peek_dict() == {"x": "a", "y": 1.0}
    assert df.as_array() == [["a", 1], ["b", "2"]]

    # Conversion to pandas yields doubles in the ``y`` column.
    frame = IterableDataFrame([["a", 1], ["b", "2"]], schema).as_pandas()
    assert frame.values.tolist() == [["a", 1.0], ["b", 2.0]]

    # An empty dataframe converts to an empty pandas frame.
    frame = IterableDataFrame([], schema).as_pandas()
    assert frame.values.tolist() == []
Exemplo n.º 12
0
def test_init():
    """Cover the ways an ``IterableDataFrame`` can be constructed."""
    # Schema only: empty and unbounded.
    df = IterableDataFrame(schema="a:str,b:int")
    assert df.empty
    assert df.schema == "a:str,b:int"
    assert not df.is_bounded

    data = [["a", 1], ["b", 2]]
    as_doubles = [["a", 1.0], ["b", 2.0]]

    # From a list of rows, with the second column cast per schema.
    df = IterableDataFrame(data, "a:str,b:str")
    assert df.as_array(type_safe=True) == [["a", "1"], ["b", "2"]]
    assert df.empty  # after iterating all items
    for schema, expected in (
        ("a:str,b:int", [["a", 1], ["b", 2]]),
        ("a:str,b:double", as_doubles),
    ):
        df = IterableDataFrame(data, schema)
        assert df.as_array(type_safe=True) == expected

    # From another IterableDataFrame, without a schema argument.
    ddf = IterableDataFrame(IterableDataFrame(data, "a:str,b:double"))
    assert ddf.as_array(type_safe=True) == as_doubles

    # From another IterableDataFrame plus a schema or column selection:
    # (second argument, expected schema or None to skip, expected rows).
    wrap_cases = [
        ("a:str,b:float64", None, as_doubles),
        ("b:str,a:str", None, [["1", "a"], ["2", "b"]]),
        (["b"], "b:double", [[1.0], [2.0]]),
        (["a:str,b:str"], None, [["a", "1"], ["b", "2"]]),
        (["b:str"], None, [["1"], ["2"]]),
    ]
    for target, expected_schema, expected_rows in wrap_cases:
        df = IterableDataFrame(data, "a:str,b:double")
        ddf = IterableDataFrame(df, target)
        if expected_schema is not None:
            assert ddf.schema == expected_schema
        assert ddf.as_array(type_safe=True) == expected_rows

    # From a PandasDataFrame.
    pdf = PandasDataFrame(data, "a:str,b:double")
    df = IterableDataFrame(pdf, "a:str,b:double")
    assert df.as_array(type_safe=True) == as_doubles
    df = IterableDataFrame(pdf, "b:str,a:str")
    assert df.as_array(type_safe=True) == [["1.0", "a"], ["2.0", "b"]]

    # From an empty list.
    df = IterableDataFrame([], "x:str,y:double")
    assert df.empty
    assert df.is_local

    # Unsupported input is rejected.
    raises(FugueDataFrameInitError, lambda: IterableDataFrame(123))
Exemplo n.º 13
0
 def df(self,
        data: Any = None,
        schema: Any = None,
        metadata: Any = None) -> IterableDataFrame:
     """Build an ``IterableDataFrame`` from the given arguments.

     All three arguments are forwarded positionally, unchanged, to the
     ``IterableDataFrame`` constructor.
     """
     frame = IterableDataFrame(data, schema, metadata)
     return frame