def test_download_and_load_pums_data_download(self, mock_fetch_pums_data, mock_exists,
                                              mock_CleanedData, mock_pandas_to_csv):
    '''Verify fetch_pums_data is called with the proper arguments if local
    pums files aren't found
    '''
    # Arrange: fetch_pums_data returns a (households, persons) pair of CleanedData.
    mock_fetch_pums_data.return_value = (
        CleanedData(pandas.DataFrame()),
        CleanedData(pandas.DataFrame()),
    )
    configuration = self._mock_config()
    # Pretend neither csv exists locally so the download branch is exercised.
    mock_exists.return_value = False

    params = self._mock_params
    download_allocate_generate.download_and_load_pums_data(
        output_dir=params['output_dir'],
        state_id=params['state_id'],
        puma_id=params['puma_id'],
        configuration=configuration,
        db_host=params['db_host'],
        db_database=params['db_database'],
        db_schema=params['db_schema'],
        db_user=params['db_user'],
        db_password=params['db_password'],
    )

    # Assert: the db fetch happened, with exactly the mocked connection params.
    mock_fetch_pums_data.assert_called()
    mock_fetch_pums_data.assert_called_with(
        state_id='01',
        puma_id='00001',
        configuration=configuration,
        db_host='host1',
        db_database='database1',
        db_schema='schema1',
        db_user='******',
        db_password='******',
    )
def download_and_load_pums_data(
        output_dir, state_id, puma_id, configuration,
        db_host, db_database, db_schema, db_user, db_password):
    '''Load PUMS household and person data, downloading it only when needed.

    If csv files for this state/puma pair already exist in output_dir they are
    loaded directly; otherwise the data is fetched from the database and
    written to csv so subsequent runs can skip the download.

    Args:
        output_dir: place to look for and write pums household and person data files
        state_id: 2-digit state fips code
        puma_id: 5-digit puma code
        configuration: keeps track of which variables/models belong to
            households and persons
        db_host: hostname of the POSTGRESQL instance to connect to
        db_database: database name to connect to
        db_schema: postgres schema name which _must_ contain a persons and
            households table with pums fields referenced in
            doppelganger/inputs.py. E.g. if your schema is called "pums", then
            your schema should have a "pums.persons" table and a
            "pums.households" table
        db_user: username to connect with
        db_password: password to authenticate to the database

    Returns:
        Household and Person dataframes with the pums fields specified above.
    '''
    household_path = os.path.sep.join(
        [output_dir, FILE_PATTERN.format(state_id, puma_id, 'households_pums.csv')])
    person_path = os.path.sep.join(
        [output_dir, FILE_PATTERN.format(state_id, puma_id, 'persons_pums.csv')])

    cached_locally = os.path.exists(household_path) and os.path.exists(person_path)
    if cached_locally:
        households_data = CleanedData.from_csv(household_path)
        persons_data = CleanedData.from_csv(person_path)
    else:
        logging.info(
            'Data not found at:\n%s\nor\n%s\n Downloading data from the db',
            household_path, person_path)
        households_data, persons_data = fetch_pums_data(
            state_id=state_id,
            puma_id=puma_id,
            configuration=configuration,
            db_host=db_host,
            db_database=db_database,
            db_schema=db_schema,
            db_user=db_user,
            db_password=db_password,
        )
        # Cache to csv so future runs don't have to hit the database again.
        households_data.data.to_csv(household_path)
        persons_data.data.to_csv(person_path)
    return households_data, persons_data
def test_from_cleaned_data(self):
    '''An allocator built from mock pums + marginals data has the expected
    allocated-households shape and column set.
    '''
    # Prepare pums data
    households_data = CleanedData(self._mock_household_data())
    persons_data = CleanedData(self._mock_person_data())
    # Prepare marginals
    marginals = Marginals(self._mock_tract_data())

    allocator = HouseholdAllocator.from_cleaned_data(
        marginals, households_data, persons_data)

    self.assertTrue(allocator)
    self.assertEqual(allocator.allocated_households.shape, (114, 17))

    expected_columns = {
        u'serial_number', u'num_people', u'num_vehicles', u'household_weight',
        u'num_people_1', u'num_people_2', u'num_people_3',
        u'num_vehicles_0', u'num_vehicles_1', u'num_vehicles_2',
        u'num_vehicles_3+',
        u'age_0-17', u'age_18-34', u'age_65+', u'age_35-64',
        u'count', u'tract',
    }
    self.assertEqual(
        set(allocator.allocated_households.columns.tolist()),
        expected_columns)