def test_join_exact_matching(self):
    """Check `join_exact_matching` on two FBI tables against a fixed frame.

    Both inputs describe the same two cities; the second table lists them
    in the opposite row order. The expected frame keeps the first table's
    row order, carries `foo`, `bar` and `index` without a suffix, and
    carries `State`, `City` and `Population` once per table with that
    table's suffix appended.
    """
    left_frame = pandas.DataFrame(
        {
            'index': ['california_sunnyvale', 'alabama_montgomery'],
            'foo': [1, 2],
            'State': ['CA', 'AL'],
            'City': ['Sunnyvale', 'Montgomery'],
            'Population': [100, 200],
        },
        index=[0, 1])
    right_frame = pandas.DataFrame(
        {
            'index': ['alabama_montgomery', 'california_sunnyvale'],
            'bar': [3, 4],
            'State': ['AL', 'CA'],
            'City': ['Montgomery', 'Sunnyvale'],
            'Population': [200, 100],
        },
        index=[0, 1])

    left_table = fbi_data_table(data=left_frame, suffix='_table1')
    right_table = fbi_data_table(data=right_frame, suffix='_table2')

    # Original note: the indices,
    #   index=['california_sunnyvale', 'alabama_montgomery']
    # will be ignored.
    # NOTE(review): the frames above are built with index=[0, 1] and the
    # string keys live in the 'index' column -- confirm what this refers to.
    expected_columns = {
        'foo': [1, 2],
        'bar': [4, 3],
        'State_table1': ['CA', 'AL'],
        'City_table1': ['Sunnyvale', 'Montgomery'],
        'Population_table1': [100, 200],
        'State_table2': ['CA', 'AL'],
        'City_table2': ['Sunnyvale', 'Montgomery'],
        'Population_table2': [100, 200],
        'index': ['california_sunnyvale', 'alabama_montgomery']
    }
    # Sort the columns of both frames so that they can be compared.
    expected_sorted = pandas.DataFrame(expected_columns).sort_index(axis=1)

    joined_table = left_table.join_exact_matching(right_table)
    actual_sorted = joined_table.data.sort_index(axis=1)

    self.assertTrue(expected_sorted.equals(actual_sorted))
def test_init_from_data(self):
    """Build an `Fbi` DataTable from a pandas dataframe and compare its data.

    The single-row frame has capitalized `State`/`City`/`Population`
    columns plus lower-case `state`/`city` columns holding the placeholder
    value 'ignored'. The table's `data` must equal the input frame.
    """
    single_row = {
        'foo': 1,
        'State': 'CA',
        'City': 'Sunnyvale',
        'Population': 100,
        'state': 'ignored',
        'city': 'ignored'
    }
    source_frame = pandas.DataFrame(single_row, index=[0])

    table = fbi_data_table(data=source_frame)

    frames_match = table.data.equals(source_frame)
    self.assertTrue(frames_match)
def test_get_fuzzy_matching_key(self):
    """Check the fuzzy-matching key produced for a single FBI row.

    The row has lower-case `state`/`city`/`population` columns with real
    values and capitalized `State`/`City` columns holding 'ignored'. The
    expected key carries the values from the lower-case columns.
    """
    single_row = {
        'foo': 1,
        'state': 'CA',
        'city': 'Sunnyvale',
        'population': 100,
        'State': 'ignored',
        'City': 'ignored'
    }
    source_frame = pandas.DataFrame(single_row, index=[0])
    table = fbi_data_table(data=source_frame)

    first_row = source_frame.iloc[0]
    actual_key = table.get_fuzzy_matching_key(first_row)
    expected_key = FuzzyMatchingKey(
        state='CA', city='Sunnyvale', population=100)

    self.assertEqual(actual_key, expected_key)
def test_join_fuzzy_matching(self):
    """Check `join_fuzzy_matching` between an FBI table and a census table.

    Each input has three rows, two of which describe the same cities
    (Sunnyvale, CA and Montgomery, AL). The expected frame contains only
    those two rows. The FBI-only row ('Hidden' / 'Lost City') and the
    census-only row ('Isle of Man' / 'Avalon') do not appear in it. The
    result must also be an instance of the left table's class.
    """
    # The same census population column label is used for the census input
    # and for the expected output, so compute it once.
    census_population_header = get_header(
        'Population Estimate (as of July 1) - 2017', 'census_2017')

    fbi_data1 = pandas.DataFrame(
        {
            'foo': [1, 2, 3],
            'state': ['CA', 'AL', 'Hidden'],
            'city': ['Sunnyvale', 'Montgomery', 'Lost City'],
            'population': [100, 200, 300],
            'index': ['california_sunnyvale', 'alabama_montgomery',
                      'atlantas_lost_city'],
        },
        index=['california_sunnyvale', 'alabama_montgomery', 'atlantis'])
    census_data2 = pandas.DataFrame(
        {
            'bar': [5, 3, 4],
            'state': ['Isle of Man', 'AL', 'CA'],
            'city': ['Avalon', 'Montgomery', 'Sunnyvale'],
            'Target Geo Id2': ['???', '1620000US0151000',
                               '1620000US0677000'],
            census_population_header: [1, 200, 100],
        },
        index=[0, 1, 2])

    fbi_table1 = fbi_data_table(data=fbi_data1, suffix='_fbi')
    census_table2 = census_data_table(data=census_data2, suffix='_census')

    joined_table = fbi_table1.join_fuzzy_matching(census_table2)

    # The output table from matching should be the same class as the left
    # table. assertIsInstance reports the actual type on failure.
    self.assertIsInstance(joined_table, fbi_data_table)

    # Sort the columns of both frames so that they can be compared.
    actual_data = joined_table.data.sort_index(axis=1)
    expected_data = pandas.DataFrame({
        'city_fbi': ['Montgomery', 'Sunnyvale'],
        'population': [200, 100],
        census_population_header: [200, 100],
        'state_census': ['AL', 'CA'],
        'Target Geo Id2': ['1620000US0151000', '1620000US0677000'],
        'bar': [3, 4],
        'foo': [2, 1],
        'city_census': ['Montgomery', 'Sunnyvale'],
        'index': ['alabama_montgomery', 'california_sunnyvale'],
        'state_fbi': ['AL', 'CA'],
    }).sort_index(axis=1)

    self.assertTrue(expected_data.equals(actual_data))
def test_init_from_file(self):
    """Load the 2017 FBI Table 8 spreadsheet and check its row count."""
    spreadsheet_path = (
        'data/fbi/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2017.xls'
    )
    table = fbi_data_table(file_path=spreadsheet_path)

    row_count = len(table.data)
    self.assertEqual(row_count, 9589)