def test_table_arrow_loads_dictionary_stream_int8(self, util):
    """Load an arrow built by ``util.make_dictionary_arrow`` with int8/string types."""
    # Judging by the expected output, each entry pairs indices (None for a
    # missing value) with the values those indices select.
    column_a = ([0, 1, 1, None], ["abc", "def"])
    column_b = ([0, 1, None, 2], ["xx", "yy", "zz"])
    dictionary_types = [[pa.int8(), pa.string()]] * 2

    arrow_data = util.make_dictionary_arrow(
        ["a", "b"], [column_a, column_b], types=dictionary_types
    )
    table = Table(arrow_data)

    expected_schema = {"a": str, "b": str}
    expected_values = {
        "a": ["abc", "def", "def", None],
        "b": ["xx", "yy", None, "zz"],
    }
    assert table.size() == 4
    assert table.schema() == expected_schema
    assert table.view().to_dict() == expected_values
def test_table_mixed_schema(self):
    """A table built from a schema of Python types reports that same schema."""
    names = ("a", "b", "c", "d", "e", "f")
    python_types = (int, float, str, bool, date, datetime)

    table = Table(dict(zip(names, python_types)))

    # Build the expectation independently of the dict handed to Table.
    assert table.schema() == dict(zip(names, python_types))
def test_table_output_string_schema(self):
    """``schema(True)`` reports each column's type as a string name."""
    schema = dict(a=int, b=float, c=str, d=bool, e=date, f=datetime)
    table = Table(schema)

    expected = dict(
        a="integer",
        b="float",
        c="string",
        d="boolean",
        e="date",
        f="datetime",
    )
    assert table.schema(True) == expected
def test_table_time_series(self, util):
    """A series made with ``freq="H"`` loads with a datetime index column."""
    series = util.make_series(freq="H")
    table = Table(series)

    assert table.size() == 10
    assert table.schema() == {"index": datetime, "0": float}

    # Ten consecutive hours starting at midnight on 2000-01-01.
    expected_index = [datetime(2000, 1, 1, hour, 0, 0) for hour in range(10)]
    assert table.view().to_dict()["index"] == expected_index
def test_table_output_readable_schema(self):
    """A schema given as type-name strings is reported back as Python types."""
    string_schema = dict(
        a="int32",
        b="float64",
        c="str",
        d="bool",
        e="date",
        f="datetime",
    )
    table = Table(string_schema)

    expected = dict(a=int, b=float, c=str, d=bool, e=date, f=datetime)
    assert table.schema() == expected
def test_table_pandas_periodindex(self, util):
    """A 30-row period dataframe loads with a ``date`` index column."""
    frame = util.make_period_dataframe(30)
    table = Table(frame)

    expected_schema = {"index": date}
    expected_schema.update({name: float for name in ("a", "b", "c", "d")})
    assert table.size() == 30
    assert table.schema() == expected_schema

    # The first five index values are the first of each month, Jan..May 2000.
    first_five = table.view().to_dict()["index"][:5]
    assert first_five == [datetime(2000, month, 1) for month in range(1, 6)]
def test_table_date_series(self, util):
    """A series made with ``freq="D"`` loads with a ``date`` index column."""
    series = util.make_series(freq="D")
    table = Table(series)

    assert table.size() == 10
    assert table.schema() == {"index": date, "0": float}

    # Ten consecutive days starting 2000-01-01.
    expected_index = [datetime(2000, 1, day) for day in range(1, 11)]
    assert table.view().to_dict()["index"] == expected_index
def test_table_np_datetime_Y(self):
    """A ``datetime64[Y]`` numpy column loads as ``date``, at January 1st of each year."""
    stamps = [
        datetime(2017, 5, 12, 11, 0),
        datetime(2018, 6, 12, 11, 0),
        datetime(2019, 7, 12, 11, 0),
    ]
    yearly = np.array(stamps, dtype="datetime64[Y]")
    table = Table({"a": yearly})

    assert table.schema() == {"a": date}

    expected = {"a": [datetime(year, 1, 1, 0, 0) for year in (2017, 2018, 2019)]}
    assert table.view().to_dict() == expected
def test_table_dataframe_minute_index(self, util):
    """A dataframe made with ``freq="min"`` loads with a datetime index column."""
    frame = util.make_dataframe(size=5, freq="min")
    table = Table(frame)

    expected_schema = {"index": datetime}
    expected_schema.update({name: float for name in ("a", "b", "c", "d")})
    assert table.size() == 5
    assert table.schema() == expected_schema

    # Five consecutive minutes starting at midnight on 2000-01-01.
    expected_index = [datetime(2000, 1, 1, 0, minute) for minute in range(5)]
    assert table.view().to_dict()["index"] == expected_index
def test_table_np_implicit_index(self):
    """An update keyed by ``__INDEX__`` overwrites only the addressed rows of column "a"."""
    initial = {
        "a": np.array(["a", "b", "c", "d", "e"]),
        "b": np.array([1, 2, 3, 4, 5]),
    }
    table = Table(initial)
    assert table.size() == 5
    assert table.schema() == {"a": str, "b": int}

    # Rows 1..4 get new "a" values; row 0 and all of column "b" are not in the update.
    partial = {
        "__INDEX__": np.array([1, 2, 3, 4]),
        "a": np.array(["bb", "cc", "dd", "ee"]),
    }
    table.update(partial)

    expected = {
        "a": ["a", "bb", "cc", "dd", "ee"],
        "b": [1, 2, 3, 4, 5],
    }
    assert table.view().to_dict() == expected
def test_update_arrow_updates_less_columns_stream_file(self):
    """Update a table whose schema has columns "a" and "x" from the two arrow source files."""
    schema = {"a": int, "x": float}
    table = Table(schema)

    # Binary mode: the files are read as raw bytes.
    with open(SOURCE_STREAM_ARROW, mode='rb') as stream_file:
        table.update(stream_file.read())
    assert table.size() == 4
    assert table.schema() == {"a": int, "x": float}

    with open(SOURCE_FILE_ARROW, mode='rb') as arrow_file:
        table.update(arrow_file.read())
    assert table.size() == 8

    # Column "x" is expected to stay empty after both updates.
    expected = {
        "a": [1, 2, 3, 4] * 2,
        "x": [None] * 8,
    }
    assert table.view().to_dict() == expected
def test_object_referencecount_update_clear(self):
    """Check the stored object's reference count before and after updates plus ``clear()``."""
    # NOTE: this test is refcount-sensitive; do not bind extra names to `obj`.
    obj = CustomObjectStore(1)
    payload = {"a": [obj]}
    table = Table(payload)
    assert table.schema() == {"a": object}
    assert table.size() == 1
    assert table.view().to_dict() == {"a": [obj]}

    # Expected references: `obj`, `payload`, the argument to
    # sys.getrefcount, and the table.
    assert sys.getrefcount(obj) == 4

    # Apply a random number of updates, then clear the table.
    for _ in range(randint(5, 10)):
        table.update([payload])
    table.clear()

    assert table.size() == 0
    assert table.view().to_dict() == {}

    # Expected references: `obj`, `payload`, and the argument to
    # sys.getrefcount.
    assert sys.getrefcount(obj) == 3
def test_update_arrow_arbitary_order(self, util):
    """An arrow update whose columns are named in a different order still appends correctly."""
    ints = [1, 2, 3, 4]
    strs = ["a", "b", "c", "d"]
    initial_arrow = util.make_arrow(
        ["a", "b", "c", "d"],
        [list(ints), list(strs), list(ints), list(strs)],
    )

    # Same column names, listed as c, b, a, d for the update.
    update_arrow = util.make_arrow(
        ["c", "b", "a", "d"],
        [[5, 6], ["e", "f"], [5, 6], ["e", "f"]],
    )

    table = Table(initial_arrow)
    assert table.schema() == {"a": int, "b": str, "c": int, "d": str}

    table.update(update_arrow)
    assert table.size() == 6

    all_ints = [1, 2, 3, 4, 5, 6]
    all_strs = ["a", "b", "c", "d", "e", "f"]
    assert table.view().to_dict() == {
        "a": all_ints,
        "b": all_strs,
        "c": all_ints,
        "d": all_strs,
    }
def test_update_arrow_updates_stream_file(self):
    """Update a schema-built table from the stream arrow file and then the file arrow file."""
    schema = {"a": int, "b": float, "c": str}
    table = Table(schema)

    # Binary mode: the files are read as raw bytes.
    with open(SOURCE_STREAM_ARROW, mode='rb') as stream_file:
        table.update(stream_file.read())
    assert table.size() == 4
    assert table.schema() == {"a": int, "b": float, "c": str}

    with open(SOURCE_FILE_ARROW, mode='rb') as arrow_file:
        table.update(arrow_file.read())
    assert table.size() == 8

    # Both files are expected to contribute the same four rows.
    expected = {
        "a": [1, 2, 3, 4] * 2,
        "b": [1.5, 2.5, 3.5, 4.5] * 2,
        "c": ["a", "b", "c", "d"] * 2,
    }
    assert table.view().to_dict() == expected
def test_table_bool_infer_str_all_formats_from_schema(self):
    """Several truthy/falsy spellings all load as booleans."""
    # NOTE(review): the name says "from_schema", but the table is constructed
    # directly from the rows rather than from a schema - confirm the intent.
    spellings = [
        ("True", "False"),
        ("t", "f"),
        ("true", "false"),
        (1, 0),
        ("on", "off"),
    ]
    rows = [{"a": truthy, "b": falsy} for truthy, falsy in spellings]

    table = Table(rows)

    assert table.schema() == {"a": bool, "b": bool}
    assert table.size() == 5
    assert table.view().to_dict() == {
        "a": [True] * 5,
        "b": [False] * 5,
    }
def test_table_np_promote(self):
    """Update an int/float/int schema from numpy arrays, one holding 2147483648."""
    schema = {"a": int, "b": float, "c": int}
    table = Table(schema)

    update = {
        "a": np.arange(5),
        "b": np.full(5, np.nan),
        "c": np.array([1, 2, 3, 2147483648, 5]),
    }
    table.update(update)

    assert table.size() == 5
    assert table.schema() == {"a": int, "b": float, "c": int}

    # NaN values in "b" are expected to read back as None.
    expected = {
        "a": list(range(5)),
        "b": [None] * 5,
        "c": [1.0, 2.0, 3.0, 2147483648.0, 5.0],
    }
    assert table.view().to_dict() == expected
def test_table_infer_datetime_edge(self):
    """A datetime string after six leading Nones is inferred as ``datetime``."""
    values = [None] * 6 + ["08/31/2019 00:00:01"]
    table = Table({"a": values})
    assert table.schema() == {"a": datetime}
def test_table_infer_ymd_date(self):
    """A ``YYYY/MM/DD`` string after six leading Nones is inferred as ``date``."""
    values = [None] * 6 + ["2019/01/03"]
    table = Table({"a": values})
    assert table.schema() == {"a": date}
def test_table_infer_invalid_date(self):
    """A date-like string with an impossible day (55) is inferred as ``str``."""
    values = [None] * 6 + ["08/55/2019"]
    table = Table({"a": values})
    assert table.schema() == {"a": str}
def test_table_infer_date_from_date(self):
    """A ``date`` object after six leading Nones is inferred as ``date``."""
    # Pass an actual `date` instance (not a string) to check it infers as date.
    values = [None] * 6 + [date(2019, 7, 11)]
    table = Table({"a": values})
    assert table.schema() == {"a": date}
def test_table_infer_ambiguous_date(self):
    """A day/month-ambiguous string ("01/03/2019") is still inferred as ``date``."""
    values = [None] * 6 + ["01/03/2019"]
    table = Table({"a": values})
    assert table.schema() == {"a": date}
def test_table_infer_bool(self):
    """Booleans after four leading Nones are inferred as ``bool``."""
    values = [None] * 4 + [True] * 3
    table = Table({"a": values})
    assert table.schema() == {"a": bool}
def test_table_infer_str(self):
    """A plain string after six leading Nones is inferred as ``str``."""
    values = [None] * 6 + ["abc"]
    table = Table({"a": values})
    assert table.schema() == {"a": str}
def test_table_infer_float(self):
    """Floats after four leading Nones are inferred as ``float``."""
    values = [None] * 4 + [1.0, 2.0]
    table = Table({"a": values})
    assert table.schema() == {"a": float}
def test_table_datetime_infer_no_false_positive(self):
    """Text containing date separator characters but no date is inferred as ``str``."""
    not_a_date = " . - / but clearly not a date"
    table = Table({"a": [not_a_date]})
    assert table.schema() == {"a": str}
def test_table_strict_datetime_separator_infer(self):
    """A dash-separated date with a time part is inferred as ``datetime``."""
    stamp = "2019-10-01 7:30"
    table = Table({"a": [stamp]})
    assert table.schema() == {"a": datetime}
def test_table_strict_date_infer(self):
    """A space-separated year/month/day string is inferred as ``date``."""
    day = "2019 09 10"
    table = Table({"a": [day]})
    assert table.schema() == {"a": date}
def test_table_strict_datetime_infer(self):
    """Short numeric strings are inferred as ``str``, not as a date or datetime."""
    # The strings '10' down to '1'.
    countdown = [str(number) for number in range(10, 0, -1)]
    table = Table({"a": countdown})
    assert table.schema() == {"a": str}
def test_table_recarray(self):
    """A numpy record array loads with one column per field."""
    field_types = [('x', '<f8'), ('y', '<i8')]
    structured = np.array([(1.0, 2), (3.0, 4)], dtype=field_types)
    records = structured.view(np.recarray)

    table = Table(records)

    assert table.schema() == {"x": float, "y": int}
    assert table.view().to_dict() == {"x": [1.0, 3.0], "y": [2, 4]}
def test_table_infer_mixed_datetime(self):
    """A datetime string after five leading Nones is inferred as ``datetime``."""
    values = [None] * 5 + ["08/11/2019 13:14:15"]
    table = Table({"a": values})
    assert table.schema() == {"a": datetime}