def test_delete_correct_rows_from_parquet_table_with_both_simple_and_composite_types(
):
    """Apply one Simple and one Composite matcher in the same call.

    Of the three input rows, only customer 34567 is expected to remain and
    the reported number of deleted rows is expected to be two.
    """
    simple_matcher = {
        "Column": "customer_id",
        "MatchIds": [12345],
        "Type": "Simple",
    }
    composite_matcher = {
        "Columns": ["first_name", "last_name"],
        "MatchIds": [["jane", "doe"]],
        "Type": "Composite",
    }
    frame = pd.DataFrame(
        {
            "customer_id": [12345, 23456, 34567],
            "first_name": ["john", "jane", "matteo"],
            "last_name": ["doe", "doe", "hey"],
        }
    )

    remaining, removed_count = delete_from_table(
        pa.Table.from_pandas(frame), [simple_matcher, composite_matcher]
    )
    survivors = remaining.to_pandas()

    assert len(survivors) == 1
    assert removed_count == 2
    assert survivors["customer_id"].values[0] == 34567
def test_delete_correct_rows_from_parquet_table_with_complex_composite_types():
    """Composite matcher whose columns are nested fields of a struct column.

    Two of the three (first_name, last_name) pairs in MatchIds correspond to
    rows in the data; the third ("Matteo", "Doe") does not, so the row for
    customer 34567 is expected to remain.
    """
    people = [("John", "Doe"), ("Jane", "Doe"), ("Matteo", "Hey")]
    frame = pd.DataFrame(
        {
            "customer_id": [12345, 23456, 34567],
            "details": [
                {"first_name": first, "last_name": last}
                for first, last in people
            ],
        }
    )
    matchers = [
        {
            "Columns": ["details.first_name", "details.last_name"],
            "MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Matteo", "Doe"]],
            "Type": "Composite",
        }
    ]

    remaining, removed_count = delete_from_table(
        pa.Table.from_pandas(frame), matchers
    )
    survivors = remaining.to_pandas()

    assert len(survivors) == 1
    assert removed_count == 2
    assert survivors["customer_id"].values[0] == 34567
def test_it_handles_data_with_pandas_indexes():
    """Delete rows from a table built from a DataFrame with a custom index.

    The frame is indexed by "a", "b", "c"; after deletion the remaining row
    is expected to still carry its index value in ``__index_level_0__``.
    """
    records = [
        {"customer_id": "12345"},
        {"customer_id": "23456"},
        {"customer_id": "34567"},
    ]
    frame = pd.DataFrame(records, index=["a", "b", "c"])
    matchers = [
        {
            "Column": "customer_id",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        }
    ]

    remaining, removed_count = delete_from_table(
        pa.Table.from_pandas(frame), matchers
    )
    survivors = remaining.to_pandas()

    assert len(survivors) == 1
    assert removed_count == 2
    expected = {"customer_id": ["34567"], "__index_level_0__": ["c"]}
    assert remaining.to_pydict() == expected
def test_handles_lower_cased_column_names():
    """Match a mixed-case nested field using an all-lower-case column path.

    The data uses ``userData.customerId`` while the matcher names the column
    ``userdata.customerid``; two rows are still expected to be deleted and
    the surviving row keeps its original key casing.
    """
    customer_ids = ("12345", "23456", "34567")
    frame = pd.DataFrame(
        [{"userData": {"customerId": cid}} for cid in customer_ids]
    )
    matchers = [
        {
            "Column": "userdata.customerid",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        }
    ]

    remaining, removed_count = delete_from_table(
        pa.Table.from_pandas(frame), matchers
    )
    survivors = remaining.to_pandas()

    assert len(survivors) == 1
    assert removed_count == 2
    assert remaining.to_pydict() == {"userData": [{"customerId": "34567"}]}
def test_delete_correct_rows_from_parquet_table_with_complex_types():
    """Match on a field nested inside a struct column, passing the schema.

    Rows whose ``user_info.name`` is "matteo" or "chris" are expected to be
    deleted, leaving only customer 23456 with its struct value intact.
    """
    names = ("matteo", "nick", "chris")
    frame = pd.DataFrame(
        {
            "customer_id": [12345, 23456, 34567],
            "user_info": [
                {"name": name, "email": "*****@*****.**"} for name in names
            ],
        }
    )
    matchers = [{"Column": "user_info.name", "MatchIds": ["matteo", "chris"]}]
    source_table = pa.Table.from_pandas(frame)
    frame_schema = pa.Schema.from_pandas(frame)

    remaining, removed_count = delete_from_table(
        source_table, matchers, frame_schema
    )
    survivors = remaining.to_pandas()

    assert len(survivors) == 1
    assert removed_count == 2
    assert survivors["customer_id"].values[0] == 23456
    # user_info is saved unflattened preserving original schema:
    expected_user_info = {"name": "nick", "email": "*****@*****.**"}
    assert survivors["user_info"].values[0] == expected_user_info
def test_delete_correct_rows_from_table():
    """Delete two of three rows by a flat string column, passing the schema.

    Only the row with customer_id "34567" is expected to remain.
    """
    frame = pd.DataFrame(
        [
            {"customer_id": "12345"},
            {"customer_id": "23456"},
            {"customer_id": "34567"},
        ]
    )
    matchers = [{"Column": "customer_id", "MatchIds": ["12345", "23456"]}]
    source_table = pa.Table.from_pandas(frame)
    frame_schema = pa.Schema.from_pandas(frame)

    remaining, removed_count = delete_from_table(
        source_table, matchers, frame_schema
    )
    survivors = remaining.to_pandas()

    assert len(survivors) == 1
    assert removed_count == 2
    assert remaining.to_pydict() == {"customer_id": ["34567"]}