def test_merge_schemas_simple(spark_session: SparkSession):
    """Union two frames that share some columns and each add one of their own.

    Both inputs carry ``id``, ``product`` and ``price``; only the first has
    ``weight`` and only the second has ``amount``. The expected rows hold
    ``None`` wherever the originating frame lacks a column.
    """
    grocery_rows = [
        (0, "potato", "0.5", "100"),
        (1, "onion", "0.5", "150"),
    ]
    media_rows = [
        (2, "CyberPunk2077", 1, "3000"),
        (3, "TENET", 1, "2000"),
    ]
    groceries = spark_session.createDataFrame(
        grocery_rows, ["id", "product", "weight", "price"]
    )
    media = spark_session.createDataFrame(
        media_rows, ["id", "product", "amount", "price"]
    )

    merged = SchemaMerging().union(groceries, media)

    # NOTE (original author): the implementation was observed to return the
    # same data with a different column order, namely:
    #   Row(id=0, product='potato', price='100', amount=None, weight='0.5')
    #   Row(id=1, product='onion', price='150', amount=None, weight='0.5')
    #   Row(id=2, product='CyberPunk2077', price='3000', amount=1, weight=None)
    #   Row(id=3, product='TENET', price='2000', amount=1, weight=None)
    # The expectation below keeps the order id, product, weight, price, amount.
    # NOTE(review): Row is tuple-based, so this comparison is presumably
    # sensitive to column order -- confirm for the PySpark version in use.
    expected = [
        Row(id=0, product='potato', weight='0.5', price='100', amount=None),
        Row(id=1, product='onion', weight='0.5', price='150', amount=None),
        Row(id=2, product='CyberPunk2077', weight=None, price='3000', amount=1),
        Row(id=3, product='TENET', weight=None, price='2000', amount=1),
    ]
    assert merged.collect() == expected
def test_merge_schemas_diff_types(spark_session: SparkSession):
    """Union two frames whose ``price`` column differs in type.

    The first frame stores ``price`` as integers, the second as strings; all
    other columns match. The expected rows expose ``price_bigint`` and
    ``price_string`` columns in place of ``price``.
    """
    # NOTE(review): a later function in this file reuses this exact name.
    # The later definition rebinds the module-level name, so pytest will only
    # collect that one and this body will not run as a test.
    int_priced_rows = [
        (0, "potato", "0.5", 100),
        (1, "onion", "0.5", 150),
    ]
    str_priced_rows = [
        (2, "apple", "1", "300"),
        (3, "pineapple", "1", "200"),
    ]
    shared_columns = ["id", "product", "weight", "price"]
    int_priced = spark_session.createDataFrame(int_priced_rows, shared_columns)
    str_priced = spark_session.createDataFrame(str_priced_rows, shared_columns)

    merged = SchemaMerging().union(int_priced, str_priced)

    # Every price is expected in ``price_bigint`` as a string value, with
    # ``price_string`` left empty.
    expected = [
        Row(id=0, product='potato', weight='0.5', price_bigint='100', price_string=None),
        Row(id=1, product='onion', weight='0.5', price_bigint='150', price_string=None),
        Row(id=2, product='apple', weight='1', price_bigint='300', price_string=None),
        Row(id=3, product='pineapple', weight='1', price_bigint='200', price_string=None),
    ]
    assert merged.collect() == expected
def test_merge_schemas_no_common(spark_session: SparkSession):
    """Union two frames that have no column names in common.

    The first frame has ``uuid``, ``car`` and ``mileage``; the second has
    ``id``, ``product``, ``weight`` and ``price``. The expected rows contain
    all seven columns, with ``None`` in the columns the originating frame
    does not have.
    """
    car_rows = [
        ('uuid1', "honda", "50000"),
        ('uuid2', "toyota", "60000"),
    ]
    fruit_rows = [
        (2, "apple", "1", "300"),
        (3, "pineapple", "1", "200"),
    ]
    cars = spark_session.createDataFrame(car_rows, ["uuid", "car", "mileage"])
    fruits = spark_session.createDataFrame(
        fruit_rows, ["id", "product", "weight", "price"]
    )

    merged = SchemaMerging().union(cars, fruits)

    # NOTE (original author): the implementation was observed to return the
    # same data with a different column order, namely:
    #   Row(id=None, weight=None, product=None, price=None,
    #       uuid='uuid1', car='honda', mileage='50000')
    #   Row(id=None, weight=None, product=None, price=None,
    #       uuid='uuid2', car='toyota', mileage='60000')
    #   Row(id=2, weight='1', product='apple', price='300',
    #       uuid=None, car=None, mileage=None)
    #   Row(id=3, weight='1', product='pineapple', price='200',
    #       uuid=None, car=None, mileage=None)
    # The expectation below lists the first frame's columns first, followed
    # by the second frame's columns in their declared order.
    # NOTE(review): Row is tuple-based, so this comparison is presumably
    # sensitive to column order -- confirm for the PySpark version in use.
    expected = [
        Row(uuid='uuid1', car='honda', mileage='50000', id=None, product=None, weight=None, price=None),
        Row(uuid='uuid2', car='toyota', mileage='60000', id=None, product=None, weight=None, price=None),
        Row(uuid=None, car=None, mileage=None, id=2, product='apple', weight='1', price='300'),
        Row(uuid=None, car=None, mileage=None, id=3, product='pineapple', weight='1', price='200'),
    ]
    assert merged.collect() == expected
def test_merge_schemas_diff_types(spark_session: SparkSession):
    """Union two frames whose ``price`` column differs in type.

    The first frame stores ``price`` as integers, the second as strings; all
    other columns match. The expected rows expose ``price_bigint`` and
    ``price_string`` columns in place of ``price``.
    """
    # NOTE(review): an earlier function in this file has this exact name.
    # This later definition is the one left bound at module level, so it is
    # the one pytest collects.
    int_priced_rows = [
        (0, "potato", "0.5", 100),
        (1, "onion", "0.5", 150),
    ]
    str_priced_rows = [
        (2, "apple", "1", "300"),
        (3, "pineapple", "1", "200"),
    ]
    shared_columns = ["id", "product", "weight", "price"]
    int_priced = spark_session.createDataFrame(int_priced_rows, shared_columns)
    str_priced = spark_session.createDataFrame(str_priced_rows, shared_columns)

    merged = SchemaMerging().union(int_priced, str_priced)

    # NOTE (original author): the implementation was observed to keep each
    # price in a column matching its source type instead of casting:
    #   Row(id=0, product='potato', weight='0.5', price_string=None, price_bigint=100)
    #   Row(id=1, product='onion', weight='0.5', price_string=None, price_bigint=150)
    #   Row(id=2, product='apple', weight='1', price_string='300', price_bigint=None)
    #   Row(id=3, product='pineapple', weight='1', price_string='200', price_bigint=None)
    # The author considers that result more accurate, because a cast is not
    # always possible. The expectation below instead places every price in
    # ``price_bigint`` as a string value and leaves ``price_string`` empty.
    expected = [
        Row(id=0, product='potato', weight='0.5', price_bigint='100', price_string=None),
        Row(id=1, product='onion', weight='0.5', price_bigint='150', price_string=None),
        Row(id=2, product='apple', weight='1', price_bigint='300', price_string=None),
        Row(id=3, product='pineapple', weight='1', price_bigint='200', price_string=None),
    ]
    assert merged.collect() == expected