def test_step_table_join_hash_is_true():
    """Join a ``note`` lookup onto the source by ``id`` with ``use_hash=True``.

    The joined table is expected to gain a ``note`` string field and to hold
    only ids 1 and 2, the ids listed in the lookup data.
    """
    source = Resource("data/transform.csv")
    lookup = Resource(data=[["id", "note"], [1, "beer"], [2, "vine"]])
    pipeline = [
        steps.table_normalize(),
        steps.table_join(resource=lookup, field_name="id", use_hash=True),
    ]
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "name", "type": "string"},
        {"name": "population", "type": "integer"},
        {"name": "note", "type": "string"},
    ]
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83, "note": "beer"},
        {"id": 2, "name": "france", "population": 66, "note": "vine"},
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == {"fields": expected_fields}
    assert target.read_rows() == expected_rows
def test_step_table_join_mode_right():
    """Join in ``mode="right"`` so every lookup row appears in the output.

    Lookup id 4 is expected to come through with ``None`` for the source
    columns ``name`` and ``population``.
    """
    source = Resource(path="data/transform.csv")
    notes = Resource(data=[["id", "note"], [1, "beer"], [4, "rum"]])
    join_step = steps.table_join(resource=notes, field_name="id", mode="right")

    target = transform(source, steps=[steps.table_normalize(), join_step])

    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
            {"name": "note", "type": "string"},
        ]
    }
    assert target.schema == expected_schema

    expected_rows = [
        {"id": 1, "name": "germany", "population": 83, "note": "beer"},
        {"id": 4, "name": None, "population": None, "note": "rum"},
    ]
    assert target.read_rows() == expected_rows
def test_step_row_filter_petl_selectisinstance():
    """Filter rows with a callable ``function`` that checks ``id`` is an int.

    The output schema must equal the inferred source schema, and the three
    listed rows are expected to pass the filter.

    NOTE(review): a second function with this exact name is defined later in
    this file. That later definition rebinds the name, so this version is
    presumably never collected by pytest -- confirm and rename one of them.
    """

    def id_is_integer(row):
        return isinstance(row["id"], int)

    source = Resource(path="data/transform.csv")
    source.infer()

    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.row_filter(function=id_is_integer),
        ],
    )

    assert target.schema == source.schema
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
    assert target.read_rows() == expected_rows
def test_step_row_filter_petl_selectrangeopen():
    """Filter with the formula ``1 <= id <= 3`` (both bounds inclusive).

    The schema must match the inferred source schema and ids 1, 2 and 3 are
    expected in the result.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    range_filter = steps.row_filter(formula="1 <= id <= 3")

    target = transform(source, steps=[steps.table_normalize(), range_filter])

    assert target.schema == source.schema
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
    assert target.read_rows() == expected_rows
def test_step_table_intersect_with_use_hash():
    """Intersect the source with an inline table using ``use_hash=True``.

    The inline table lists france with population 50, and the test expects
    only the germany and spain rows to remain.
    """
    source = Resource("data/transform.csv")
    other = Resource(
        data=[
            ["id", "name", "population"],
            [1, "germany", 83],
            [2, "france", 50],
            [3, "spain", 47],
        ]
    )
    pipeline = [
        steps.table_normalize(),
        steps.table_intersect(resource=other, use_hash=True),
    ]
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "name", "type": "string"},
        {"name": "population", "type": "integer"},
    ]
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 3, "name": "spain", "population": 47},
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == {"fields": expected_fields}
    assert target.read_rows() == expected_rows
def test_step_row_filter_petl_selectgt():
    """Filter with the predicate string ``<formula>id > 2``.

    The source is inferred with ``only_sample=True``; the schema must be
    unchanged and only the row with id 3 is expected.
    """
    source = Resource(path="data/transform.csv")
    source.infer(only_sample=True)
    pipeline = [
        steps.table_normalize(),
        steps.row_filter(predicat="<formula>id > 2"),
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == source.schema
    only_spain = [{"id": 3, "name": "spain", "population": 47}]
    assert target.read_rows() == only_spain
def test_step_row_filter_petl_selectrangeclosed():
    """Filter with the predicate string ``<formula>1 < id < 3``.

    Both bounds are strict, so only the row with id 2 is expected; the
    schema must equal the inferred source schema.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    strict_range = steps.row_filter(predicat="<formula>1 < id < 3")

    target = transform(source, steps=[steps.table_normalize(), strict_range])

    assert target.schema == source.schema
    expected_rows = [{"id": 2, "name": "france", "population": 66}]
    assert target.read_rows() == expected_rows
def test_step_row_filter_petl_selecteq():
    """Filter with the predicate string ``<formula>id == 1``.

    Only the row with id 1 is expected, with the schema left equal to the
    inferred source schema.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    pipeline = [
        steps.table_normalize(),
        steps.row_filter(predicat="<formula>id == 1"),
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == source.schema
    only_germany = [{"id": 1, "name": "germany", "population": 83}]
    assert target.read_rows() == only_germany
def test_step_table_recast():
    """Melt the table on ``id`` and recast it back on ``id``.

    The round trip is expected to yield the three-field schema and the three
    rows asserted below.
    """
    source = Resource("data/transform.csv")
    round_trip = [
        steps.table_normalize(),
        steps.table_melt(field_name="id"),
        steps.table_recast(field_name="id"),
    ]
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]

    target = transform(source, steps=round_trip)

    assert target.schema == expected_schema
    assert target.read_rows() == expected_rows
def test_step_table_melt_with_to_field_names():
    """Melt ``population`` keyed on ``name`` into custom ``key``/``val`` fields.

    The expected schema lists ``key`` and ``val`` without a ``type`` entry,
    and each output row carries the variable name and its value.
    """
    source = Resource("data/transform.csv")
    melt_step = steps.table_melt(
        field_name="name",
        variables=["population"],
        to_field_names=["key", "val"],
    )

    target = transform(source, steps=[steps.table_normalize(), melt_step])

    expected_fields = [
        {"name": "name", "type": "string"},
        {"name": "key"},
        {"name": "val"},
    ]
    assert target.schema == {"fields": expected_fields}

    expected_rows = [
        {"name": "germany", "key": "population", "val": 83},
        {"name": "france", "key": "population", "val": 66},
        {"name": "spain", "key": "population", "val": 47},
    ]
    assert target.read_rows() == expected_rows
def test_step_row_filter_petl_selectisinstance():
    """Filter rows with a callable ``function`` that checks ``id`` is an int.

    The schema is compared against an explicit three-field descriptor and
    the three listed rows are expected to pass the filter.

    NOTE(review): an earlier function in this file has this exact name; this
    definition rebinds it, so the earlier one is presumably never collected
    by pytest -- confirm and rename one of them.
    """
    source = Resource("data/transform.csv")
    pipeline = [
        steps.table_normalize(),
        steps.row_filter(function=lambda row: isinstance(row["id"], int)),
    ]
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "name", "type": "string"},
        {"name": "population", "type": "integer"},
    ]
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == {"fields": expected_fields}
    assert target.read_rows() == expected_rows
def test_step_table_join_mode_anti():
    """Join in ``mode="anti"`` against a lookup holding ids 1 and 4.

    No ``field_name`` is passed to the join step here. The result is expected
    to keep the source schema (no ``note`` field) and only ids 2 and 3.
    """
    source = Resource(path="data/transform.csv")
    lookup = Resource(data=[["id", "note"], [1, "beer"], [4, "rum"]])
    anti_join = steps.table_join(resource=lookup, mode="anti")

    target = transform(source, steps=[steps.table_normalize(), anti_join])

    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.schema == expected_schema

    expected_rows = [
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
    assert target.read_rows() == expected_rows
def test_step_table_pivot():
    """Pivot ``units`` by ``region`` (rows) and ``gender`` (columns) with ``sum``.

    The pivoted table is expected to expose one integer column per gender
    value (``boy``, ``girl``) next to the ``region`` string column, with one
    aggregated row per region.
    """
    source = Resource("data/transform-pivot.csv")
    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_pivot(f1="region", f2="gender", f3="units", aggfun=sum),
        ],
    )
    # The leftover debug ``print(target.schema)`` was dropped: the assertion
    # below already reports the schema on failure.
    assert target.schema == {
        "fields": [
            {"name": "region", "type": "string"},
            {"name": "boy", "type": "integer"},
            {"name": "girl", "type": "integer"},
        ]
    }
    assert target.read_rows() == [
        {"region": "east", "boy": 33, "girl": 29},
        {"region": "west", "boy": 35, "girl": 23},
    ]
def test_step_field_add_with_formula():
    """Add a ``calc`` field computed by the formula ``id * 100 + population``.

    The expected schema lists ``calc`` without a ``type`` entry, and each row
    carries the computed value (e.g. 183 for id 1 with population 83).
    """
    source = Resource(path="data/transform.csv")
    add_calc = steps.field_add(name="calc", formula="id * 100 + population")

    target = transform(source, steps=[steps.table_normalize(), add_calc])

    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "name", "type": "string"},
        {"name": "population", "type": "integer"},
        {"name": "calc"},
    ]
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83, "calc": 183},
        {"id": 2, "name": "france", "population": 66, "calc": 266},
        {"id": 3, "name": "spain", "population": 47, "calc": 347},
    ]
    assert target.schema == {"fields": expected_fields}
    assert target.read_rows() == expected_rows
def test_step_row_filter_petl_selectrangeopenleft():
    """Filter with the predicate string ``<formula>1 <= id < 3``.

    The lower bound is inclusive and the upper bound strict, so ids 1 and 2
    are expected. The source is inferred with ``only_sample=True`` and the
    schema must stay equal to it.
    """
    source = Resource(path="data/transform.csv")
    source.infer(only_sample=True)
    pipeline = [
        steps.table_normalize(),
        steps.row_filter(predicat="<formula>1 <= id < 3"),
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == source.schema
    expected_rows = [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
    ]
    assert target.read_rows() == expected_rows
def test_step_table_diff():
    """Diff the source against an inline table that differs on the france row.

    The inline table lists france with population 50, and the test expects
    the source's france row (population 66) to be the only row left. The
    schema must equal the inferred source schema.

    NOTE(review): a second function with this exact name is defined later in
    this file. That later definition rebinds the name, so this version is
    presumably never collected by pytest -- confirm and rename one of them.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    other = Resource(
        data=[
            ["id", "name", "population"],
            [1, "germany", 83],
            [2, "france", 50],
            [3, "spain", 47],
        ]
    )

    target = transform(
        source,
        steps=[steps.table_normalize(), steps.table_diff(resource=other)],
    )

    assert target.schema == source.schema
    expected_rows = [{"id": 2, "name": "france", "population": 66}]
    assert target.read_rows() == expected_rows
def test_step_row_filter():
    """Filter with the predicate string ``<formula>id > 1``.

    Ids 2 and 3 are expected to remain, with the schema equal to the inferred
    source schema.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    greater_than_one = steps.row_filter(predicat="<formula>id > 1")

    target = transform(source, steps=[steps.table_normalize(), greater_than_one])

    assert target.schema == source.schema
    expected_rows = [
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
    assert target.read_rows() == expected_rows
def test_step_row_filter_with_callable_predicat():
    """Filter with a callable ``predicat`` that keeps rows whose ``id`` > 1.

    The source is inferred with ``only_sample=True``; ids 2 and 3 are expected
    and the schema must stay equal to the source schema.
    """

    def id_above_one(row):
        return row["id"] > 1

    source = Resource(path="data/transform.csv")
    source.infer(only_sample=True)
    pipeline = [
        steps.table_normalize(),
        steps.row_filter(predicat=id_above_one),
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == source.schema
    expected_rows = [
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
    assert target.read_rows() == expected_rows
def test_step_table_diff():
    """Diff the source against an inline table that differs on the france row.

    The inline table lists france with population 50, and the test expects
    the source's france row (population 66) to be the only row left. The
    schema is compared against an explicit three-field descriptor.

    NOTE(review): an earlier function in this file has this exact name; this
    definition rebinds it, so the earlier one is presumably never collected
    by pytest -- confirm and rename one of them.
    """
    source = Resource("data/transform.csv")
    other = Resource(
        data=[
            ["id", "name", "population"],
            [1, "germany", 83],
            [2, "france", 50],
            [3, "spain", 47],
        ]
    )
    pipeline = [steps.table_normalize(), steps.table_diff(resource=other)]
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "name", "type": "string"},
        {"name": "population", "type": "integer"},
    ]

    target = transform(source, steps=pipeline)

    assert target.schema == {"fields": expected_fields}
    assert target.read_rows() == [
        {"id": 2, "name": "france", "population": 66},
    ]
def test_step_row_filter_petl_selectisnot():
    """Filter with the formula ``id is not 1``.

    Ids 2 and 3 are expected to remain, and the schema is compared against an
    explicit three-field descriptor.
    """
    source = Resource("data/transform.csv")
    is_not_filter = steps.row_filter(formula="id is not 1")

    target = transform(source, steps=[steps.table_normalize(), is_not_filter])

    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.schema == expected_schema

    expected_rows = [
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
    assert target.read_rows() == expected_rows