def test_merge_tables_duplicate_column_names():
    """
    Confirm tables can be merged with overlapping column names, as long as the
    overlapping names are not included in the list of columns to retain.

    Both tables carry a non-index column named 'dupe'. Merging everything is
    expected to raise a ValueError; restricting the merge to `columns=['value']`
    is expected to succeed.

    """
    buildings = pd.DataFrame({
        'building_id': [1, 2, 3, 4],
        'value': [4, 4, 4, 4],
        'dupe': [1, 1, 1, 1],
    }).set_index('building_id')

    households = pd.DataFrame({
        'household_id': [1, 2, 3],
        'building_id': [2, 3, 4],
        'dupe': [1, 1, 1],
    }).set_index('household_id')

    # Duplicate columns should raise a ValueError. pytest.raises reports a
    # descriptive "DID NOT RAISE" failure if no exception occurs, and lets any
    # other exception type propagate as an error.
    with pytest.raises(ValueError) as excinfo:
        merge_tables([households, buildings])
    # Keep the error message visible in the captured test output.
    print(excinfo.value)

    # Excluding the duplicated name should make things ok
    merged = merge_tables([households, buildings], columns=['value'])

    assert sorted(all_cols(merged)) == sorted(['household_id', 'value'])
def test_merge_two_tables():
    """
    Merge two tables: households, which carries a 'building_id' column, and
    buildings, which is indexed by 'building_id'.

    """
    buildings = pd.DataFrame(
        {'building_id': [1, 2, 3, 4], 'value': [4, 4, 4, 4]}
    ).set_index('building_id')

    households = pd.DataFrame(
        {'household_id': [1, 2, 3], 'building_id': [2, 3, 4]}
    ).set_index('household_id')

    result = merge_tables([households, buildings])

    # Compare as sorted lists so column order does not matter.
    expected_cols = ['household_id', 'building_id', 'value']
    assert sorted(all_cols(result)) == sorted(expected_cols)
def test_merge_three_tables_out_of_order():
    """
    Merge three tables where the second and third are each merged onto the
    first: households holds both the 'building_id' and 'zone_id' columns.

    """
    zones = pd.DataFrame(
        {'zone_id': [1], 'size': [1]}
    ).set_index('zone_id')

    buildings = pd.DataFrame(
        {'building_id': [1, 2, 3, 4], 'height': [4, 4, 4, 4]}
    ).set_index('building_id')

    households = pd.DataFrame({
        'household_id': [1, 2, 3],
        'building_id': [2, 3, 4],
        'zone_id': [1, 1, 1],
    }).set_index('household_id')

    result = merge_tables([households, buildings, zones])

    # Compare as sorted lists so column order does not matter.
    expected_cols = [
        'household_id', 'building_id', 'zone_id', 'height', 'size']
    assert sorted(all_cols(result)) == sorted(expected_cols)
def test_merge_three_tables():
    """
    Merge three tables in a chain: households carries 'building_id', and
    buildings carries 'zone_id'.

    """
    zones = pd.DataFrame(
        {'zone_id': [1], 'size': [1]}
    ).set_index('zone_id')

    buildings = pd.DataFrame({
        'building_id': [1, 2, 3, 4],
        'zone_id': [1, 1, 1, 1],
        'height': [4, 4, 4, 4],
    }).set_index('building_id')

    households = pd.DataFrame(
        {'household_id': [1, 2, 3], 'building_id': [2, 3, 4]}
    ).set_index('household_id')

    result = merge_tables([households, buildings, zones])

    # Compare as sorted lists so column order does not matter.
    expected_cols = [
        'household_id', 'building_id', 'zone_id', 'height', 'size']
    assert sorted(all_cols(result)) == sorted(expected_cols)
def test_merge_tables_multiindex():
    """
    Merge tables where the source table (units) has a two-level index of
    ('building_id', 'unit_id') and households carries both as columns.

    """
    units = pd.DataFrame({
        'building_id': [1, 1, 2, 2],
        'unit_id': [1, 2, 1, 2],
        'value': [4, 4, 4, 4],
    }).set_index(['building_id', 'unit_id'])

    households = pd.DataFrame({
        'household_id': [1, 2, 3],
        'building_id': [1, 1, 2],
        'unit_id': [1, 2, 1],
    }).set_index('household_id')

    result = merge_tables([households, units])

    # Compare as sorted lists so column order does not matter.
    expected_cols = ['household_id', 'building_id', 'unit_id', 'value']
    assert sorted(all_cols(result)) == sorted(expected_cols)
def test_merge_tables_missing_values():
    """
    If the target table includes identifiers not found in the source table,
    missing values should be inserted, changing the data type.

    The third household refers to (building_id=3, unit_id=1), which has no
    row in units, so its merged 'value' is expected to be missing.

    """
    units = pd.DataFrame({
        'building_id': [1, 1, 2, 2],
        'unit_id': [1, 2, 1, 2],
        'value': [4, 4, 4, 4],
    }).set_index(['building_id', 'unit_id'])

    households = pd.DataFrame({
        'household_id': [1, 2, 3],
        'building_id': [1, 1, 3],
        'unit_id': [1, 2, 1],
    }).set_index('household_id')

    merged = merge_tables([households, units])

    # Compare the dtype of the same column before and after the merge.
    # Bracket access is used because `merged.values` is the whole frame's
    # ndarray, whose dtype is a common type across every column rather than
    # the dtype of the 'value' column.
    assert units['value'].dtype == 'int64'
    assert merged['value'].dtype == 'float64'