def _test_as_array_perf():
    """Ad-hoc benchmark of ``as_array`` with and without ``type_safe``.

    Builds a 5000-row frame over 300 columns (100 int, 100 declared-int
    holding floats, 100 str), warms both code paths once, then accumulates
    the time of 10 runs of each and prints the totals in seconds
    (non-type-safe first, type-safe second).
    """
    import time  # local import: only this benchmark helper needs it

    s = Schema()
    row = []
    # 100 int columns with int values.
    for i in range(100):
        s.append(f"a{i}:int")
        row.append(i)
    # 100 int-declared columns deliberately fed floats (type_safe must coerce).
    for i in range(100):
        s.append(f"b{i}:int")
        row.append(float(i))
    # 100 str columns with str values.
    for i in range(100):
        s.append(f"c{i}:str")
        row.append(str(i))
    # Each row gets its own list so conversions can't alias a shared object.
    data = [list(row) for _ in range(5000)]
    df = SparkDataFrame(data, s)
    # Warm-up: exercise both paths once before timing.
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for _ in range(10):
        # perf_counter is monotonic and high-resolution; datetime.now()
        # is wall-clock and can jump under clock adjustments.
        t = time.perf_counter()
        res = df.as_array()
        nts += time.perf_counter() - t
        t = time.perf_counter()
        res = df.as_array(type_safe=True)
        ts += time.perf_counter() - t
    print(nts, ts)
def test_schema_append():
    """``Schema.append`` accepts fields, expressions, and pa schemas;
    blank strings are no-ops, and invalid inputs raise ``SchemaError``."""
    s = Schema()

    # A pyarrow field is accepted directly.
    s.append(pa.field("a", pa.int32()))
    assert s == "a:int"

    # A malformed expression and a non-schema object are both rejected.
    raises(SchemaError, lambda: s.append("b"))
    raises(SchemaError, lambda: s.append(123))

    # String expressions and pyarrow schemas extend the schema in order.
    s.append("b:str")
    assert s == "a:int,b:str"
    s.append(Schema("c:int").pa_schema)
    assert s == "a:int,b:str,c:int"

    # Empty and whitespace-only strings leave the schema untouched.
    for blank in ("", " "):
        s.append(blank)
        assert s == "a:int,b:str,c:int"

    # Constructing a Schema from a pandas DataFrame infers column types.
    df = pd.DataFrame([["a", 1], ["b", 2]], columns=["x", "y"])
    assert Schema(df) == "x:str,y:long"