Example #1
0
def _test_as_array_perf():
    """Micro-benchmark ``SparkDataFrame.as_array`` with and without ``type_safe``.

    Builds a 300-column schema — 100 int columns fed ints, 100 int columns
    fed floats (exercising coercion on the type-safe path), 100 str columns
    fed strings — replicates one row 5000 times, warms up both conversion
    paths once, then times 10 iterations of each and prints the two totals
    in seconds (non-type-safe first, type-safe second).
    """
    # perf_counter is monotonic and high-resolution; datetime.now() is
    # wall-clock time and unsuitable for benchmarking.
    from time import perf_counter

    s = Schema()
    arr = []
    # 100 int columns with matching int values.
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    # 100 int columns fed float values (forces conversion when type_safe=True).
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    # 100 str columns with str values.
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    # 5000 identical rows; copy per row so each row is an independent list.
    data = [list(arr) for _ in range(5000)]
    df = SparkDataFrame(data, s)
    # Warm up both code paths so one-time setup costs don't skew the timings.
    df.as_array()
    df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for _ in range(10):
        t = perf_counter()
        df.as_array()
        nts += perf_counter() - t
        t = perf_counter()
        df.as_array(type_safe=True)
        ts += perf_counter() - t
    print(nts, ts)
Example #2
0
def test_schema_append():
    """``Schema.append`` accepts a ``pa.Field``, expression strings, and a
    ``pa.Schema``; empty/whitespace strings are no-ops; malformed inputs
    raise ``SchemaError``. A ``Schema`` can also be inferred from a pandas
    DataFrame.
    """
    s = Schema()
    # A single pyarrow field is appended by name and type.
    s.append(pa.field("a", pa.int32()))
    assert s == "a:int"
    # A bare name without a type, or a non-schema-like object, is rejected.
    raises(SchemaError, lambda: s.append("b"))
    raises(SchemaError, lambda: s.append(123))
    # Expression strings and pa.Schema objects append in order.
    s.append("b:str")
    assert s == "a:int,b:str"
    s.append(Schema("c:int").pa_schema)
    assert s == "a:int,b:str,c:int"
    # Empty and whitespace-only strings leave the schema unchanged.
    for noop in ["", " "]:
        s.append(noop)
        assert s == "a:int,b:str,c:int"
    # Schema inference from a pandas DataFrame.
    pdf = pd.DataFrame([["a", 1], ["b", 2]], columns=["x", "y"])
    assert Schema(pdf) == "x:str,y:long"