def testWrite_CalculatesSum(self):
    """Persists the parsed PDF tables and checks the in_house sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: the summed in_house counts match the report-wide total.
    sum_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(NyFacilityAggregate.in_house))
    actual_total = one(one(sum_query.all()))
    self.assertEqual(actual_total, 189012)
def testWrite_CalculatesCountyPopulationSum(self):
    """Persists the parsed PDF tables and checks the county_population sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed county populations equal the report-wide total.
    population_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(FlCountyAggregate.county_population))
    actual_total = one(one(population_query.all()))
    self.assertEqual(actual_total, 20148654)
def testWrite_CalculatesSum(self):
    """Persists the parsed PDF tables and checks the total_population sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed facility populations equal the report-wide total.
    sum_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(HiFacilityAggregate.total_population))
    actual_total = one(one(sum_query.all()))
    self.assertEqual(actual_total, 5241)
def testWrite_CalculatesSum(self):
    """Persists the shared parsed result and checks the ADP sum."""
    # Act: write every parsed table to the database.
    for model, frame in PARSED_RESULT.items():
        dao.write_df(model, frame)

    # Assert: summed average daily populations equal the report-wide total.
    adp_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(CaFacilityAggregate.average_daily_population))
    actual_total = one(one(adp_query.all()))
    self.assertEqual(actual_total, 900124)
def testWrite_CalculatesSum(self):
    """Persists the parsed PDF tables and checks the inmate-count sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed inmate counts equal the report-wide total.
    inmate_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(GaCountyAggregate.total_number_of_inmates_in_jail))
    actual_total = one(one(inmate_query.all()))
    self.assertEqual(actual_total, 37697)
def testWrite_Table2_CalculateSum(self):
    """Persists the shared parsed result and checks the pre-sentenced sum."""
    # Act: write every parsed table to the database.
    for model, frame in PARSED_RESULT.items():
        dao.write_df(model, frame)

    # Assert: summed pre-sentenced populations equal the report-wide total.
    pretrial_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(PaCountyPreSentencedAggregate.pre_sentenced_population))
    actual_total = one(one(pretrial_query.all()))
    self.assertEqual(actual_total, 82521)
def testWrite_CalculatesSum_before_1996(self):
    """Persists the pre-1996 parsed PDF and checks the pretrial_felons sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf_before_1996.items():
        dao.write_df(model, frame)

    # Assert: summed pretrial felons equal the report-wide total.
    felons_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(TxCountyAggregate.pretrial_felons))
    actual_total = one(one(felons_query.all()))
    self.assertEqual(actual_total, 14727)
def testWrite_CalculatesFacilityAdpSum(self):
    """Persists the parsed PDF tables and checks the facility ADP sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed facility ADPs equal the report-wide total.
    adp_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(FlFacilityAggregate.average_daily_population))
    actual_total = one(one(adp_query.all()))
    self.assertEqual(actual_total, 52388)
def testWriteDf_doesNotOverrideMatchingColumnNames(self):
    """Writing a facility table must not clobber same-named county columns."""
    # Arrange: seed the county table first.
    monthly = enum_strings.monthly_granularity
    county_df = pd.DataFrame({
        "county_name": ["Alachua", "Baker", "Bay", "Bradford", "Brevard"],
        "county_population": [257062, 26965, 176016, 27440, 568919],
        "average_daily_population": [799, 478, 1015, 141, 1547],
        "date_reported": [
            pd.NaT,
            pd.NaT,
            datetime.datetime(year=2017, month=9, day=1),
            pd.NaT,
            pd.NaT,
        ],
        "fips": ["00000", "00001", "00002", "00003", "00004"],
        "report_date": 5 * [DATE_SCRAPED],
        "aggregation_window": 5 * [monthly],
        "report_frequency": 5 * [monthly],
    })
    dao.write_df(FlCountyAggregate, county_df)

    # Facility rows share several column names (fips, ADP, ...) but must
    # land in their own table without touching the county rows.
    facility_df = pd.DataFrame({
        "facility_name": ["One", "Two", "Three", "Four", "Five"],
        "average_daily_population": [13, 14, 15, 16, 17],
        "number_felony_pretrial": [23, 24, 25, 26, 27],
        "number_misdemeanor_pretrial": 5 * [pd.NaT],
        "fips": ["10000", "10111", "10222", "10333", "10444"],
        "report_date": 5 * [DATE_SCRAPED],
        "aggregation_window": 5 * [monthly],
        "report_frequency": 5 * [monthly],
    })

    # Act
    dao.write_df(FlFacilityAggregate, facility_df)

    # Assert: the county row keeps its original fips value.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        bay_row = one(session.query(FlCountyAggregate).filter(
            FlCountyAggregate.county_name == "Bay").all())
        self.assertEqual(bay_row.county_name, "Bay")
        self.assertEqual(bay_row.fips, "00002")
def testWrite_CalculatesSum(self) -> None:
    """Persists the parsed CSV tables and checks the jail_total_male sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_csv.items():
        dao.write_df(model, frame)

    # Assert: summed male jail counts equal the report-wide total.
    with SessionFactory.using_database(self.database_key) as session:
        sum_query = session.query(
            func.sum(MaFacilityAggregate.jail_total_male))
        actual_total = one(one(sum_query.all()))
        self.assertEqual(actual_total, 12366)
def testWrite_Table1_CalculatesSums(self):
    """Persists the shared parsed result and checks housed_elsewhere_adp."""
    # Act: write every parsed table to the database.
    for model, frame in PARSED_RESULT.items():
        dao.write_df(model, frame)

    # Assert: this report contains fractional averages, so the expected
    # total is a float rather than an integer count.
    adp_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(PaFacilityPopAggregate.housed_elsewhere_adp))
    actual_total = one(one(adp_query.all()))
    self.assertEqual(actual_total, 1564.0257)
def testWriteDf_OverlappingData_WritesNewAndIgnoresDuplicateRows(self):
    """A second write adds only genuinely new rows; duplicates are ignored."""
    # Arrange: seed the table with five counties.
    monthly = enum_strings.monthly_granularity
    initial_df = pd.DataFrame({
        "county_name": ["Alachua", "Baker", "Bay", "Bradford", "Brevard"],
        "county_population": [257062, 26965, 176016, 27440, 568919],
        "average_daily_population": [799, 478, 1015, 141, 1547],
        "date_reported": [
            pd.NaT,
            pd.NaT,
            datetime.datetime(year=2017, month=9, day=1),
            pd.NaT,
            pd.NaT,
        ],
        "fips": ["00000", "00001", "00002", "00003", "00004"],
        "report_date": 5 * [DATE_SCRAPED],
        "aggregation_window": 5 * [monthly],
        "report_frequency": 5 * [monthly],
    })
    dao.write_df(FlCountyAggregate, initial_df)

    # Overlapping frame: two rows duplicate existing keys (with changed
    # values that must be ignored), one row is genuinely new.
    overlapping_df = pd.DataFrame({
        "county_name": ["Alachua", "NewCounty", "Baker"],
        "county_population": [0, 1000000000, 0],
        "average_daily_population": [0, 50, 0],
        "date_reported": [pd.NaT, pd.NaT, pd.NaT],
        "fips": ["00000", "01000", "00002"],
        "report_date": 3 * [DATE_SCRAPED],
        "aggregation_window": 3 * [monthly],
        "report_frequency": 3 * [monthly],
    })

    # Act
    dao.write_df(FlCountyAggregate, overlapping_df)

    # Assert: the sum covers initial_df plus NewCounty only; the changed
    # values in the duplicate rows (e.g. Alachua's 0) are not applied.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        sum_query = session.query(
            func.sum(FlCountyAggregate.county_population))
        actual_total = one(one(sum_query.all()))
        self.assertEqual(actual_total, 1001056402)
def testWrite_CalculatesSum(self) -> None:
    """Persists the parsed CSV tables and checks the male inmate sum."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_csv.items():
        dao.write_df(model, frame)

    # Assert: summed male inmate counts equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        sum_query = session.query(
            func.sum(CoFacilityAggregate.male_number_of_inmates))
        actual_total = one(one(sum_query.all()))
        self.assertEqual(actual_total, 45933)
def testWrite_CalculatesSum_1996(self) -> None:
    """Persists the 1996 parsed PDF and checks the pretrial_felons sum."""
    # Guard: fail loudly if the fixture never parsed.
    if not self.parsed_pdf_1996:
        raise ValueError("Unexpectedly empty parsed_pdf_1996")

    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf_1996.items():
        dao.write_df(model, frame)

    # Assert: summed pretrial felons equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        felons_query = session.query(
            func.sum(TxCountyAggregate.pretrial_felons))
        actual_total = one(one(felons_query.all()))
        self.assertEqual(actual_total, 14140)
def testWrite_CalculatesSum_Concat(self) -> None:
    """Persists the concatenated parsed PDF and checks available_beds."""
    # Guard: fail loudly if the fixture never parsed.
    if not self.parsed_pdf_concat:
        raise ValueError("Unexpectedly empty parsed_pdf_concat")

    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf_concat.items():
        dao.write_df(model, frame)

    # Assert: summed available beds equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        beds_query = session.query(
            func.sum(TxCountyAggregate.available_beds))
        actual_total = one(one(beds_query.all()))
        self.assertEqual(actual_total, 7044)
def testWrite_CalculatesSum(self) -> None:
    """Persists the parsed CSV tables and checks total_jail_population."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_csv.items():
        dao.write_df(model, frame)

    # Assert: summed jail populations equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        sum_query = session.query(
            func.sum(WvFacilityAggregate.total_jail_population))
        actual_total = one(one(sum_query.all()))
        self.assertEqual(actual_total, 88)
def testWrite_CalculatesSum(self) -> None:
    """Persists the parsed Excel tables and checks total_jail_population."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_excel.items():
        dao.write_df(model, frame)

    # Assert: this is the expected sum, even though the excel file has a
    # different sum.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        sum_query = session.query(
            func.sum(InCountyAggregate.total_jail_population))
        actual_total = one(one(sum_query.all()))
        self.assertEqual(actual_total, 17164)
def testWrite_Table2_CalculateSum(self) -> None:
    """Persists the parsed result and checks the pre-sentenced sum."""
    # Act: write every parsed table to the database.
    for model, frame in _parsed_result().items():
        dao.write_df(model, frame)

    # Assert: summed pre-sentenced populations equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        pretrial_query = session.query(
            func.sum(PaCountyPreSentencedAggregate.pre_sentenced_population))
        actual_total = one(one(pretrial_query.all()))
        self.assertEqual(actual_total, 82521)
def testWrite_Table1_CalculatesSums(self) -> None:
    """Persists the parsed result and checks housed_elsewhere_adp."""
    # Act: write every parsed table to the database.
    for model, frame in _parsed_result().items():
        dao.write_df(model, frame)

    # Assert: this report contains fractional averages, so the expected
    # total is a float rather than an integer count.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        adp_query = session.query(
            func.sum(PaFacilityPopAggregate.housed_elsewhere_adp))
        actual_total = one(one(adp_query.all()))
        self.assertEqual(actual_total, 1564.0257)
def testWrite_CalculatesFacilityAdpSum(self) -> None:
    """Persists the parsed PDF tables and checks the facility ADP sum."""
    # Guard: fail loudly if the fixture never parsed.
    if not self.parsed_pdf:
        raise ValueError("Unexpectedly empty parsed_pdf")

    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed facility ADPs equal the report-wide total.
    with SessionFactory.using_database(self.database_key) as session:
        adp_query = session.query(
            func.sum(FlFacilityAggregate.average_daily_population))
        actual_total = one(one(adp_query.all()))
        self.assertEqual(actual_total, 52388)
def testWrite_CalculatesCountyPopulationSum(self) -> None:
    """Persists the parsed PDF tables and checks the county_population sum."""
    # Guard: fail loudly if the fixture never parsed.
    if not self.parsed_pdf:
        raise ValueError("Unexpectedly empty parsed_pdf")

    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed county populations equal the report-wide total.
    with SessionFactory.using_database(self.database_key) as session:
        population_query = session.query(
            func.sum(FlCountyAggregate.county_population))
        actual_total = one(one(population_query.all()))
        self.assertEqual(actual_total, 20148654)
def testWriteDf_doesNotOverrideMatchingColumnNames(self):
    """Writing a facility table must not clobber same-named county columns."""
    # Arrange: seed the county table first.
    monthly = enum_strings.monthly_granularity
    county_df = pd.DataFrame({
        'county_name': ['Alachua', 'Baker', 'Bay', 'Bradford', 'Brevard'],
        'county_population': [257062, 26965, 176016, 27440, 568919],
        'average_daily_population': [799, 478, 1015, 141, 1547],
        'date_reported': [
            pd.NaT,
            pd.NaT,
            datetime.datetime(year=2017, month=9, day=1),
            pd.NaT,
            pd.NaT,
        ],
        'fips': ['00000', '00001', '00002', '00003', '00004'],
        'report_date': 5 * [DATE_SCRAPED],
        'aggregation_window': 5 * [monthly],
        'report_frequency': 5 * [monthly],
    })
    dao.write_df(FlCountyAggregate, county_df)

    # Facility rows share several column names (fips, ADP, ...) but must
    # land in their own table without touching the county rows.
    facility_df = pd.DataFrame({
        'facility_name': ['One', 'Two', 'Three', 'Four', 'Five'],
        'average_daily_population': [13, 14, 15, 16, 17],
        'number_felony_pretrial': [23, 24, 25, 26, 27],
        'number_misdemeanor_pretrial': 5 * [pd.NaT],
        'fips': ['10000', '10111', '10222', '10333', '10444'],
        'report_date': 5 * [DATE_SCRAPED],
        'aggregation_window': 5 * [monthly],
        'report_frequency': 5 * [monthly],
    })

    # Act
    dao.write_df(FlFacilityAggregate, facility_df)

    # Assert: the county row keeps its original fips value.
    bay_query = SessionFactory.for_schema_base(JailsBase) \
        .query(FlCountyAggregate) \
        .filter(FlCountyAggregate.county_name == 'Bay')
    bay_row = one(bay_query.all())
    self.assertEqual(bay_row.county_name, 'Bay')
    self.assertEqual(bay_row.fips, '00002')
def testWrite_CalculatesSum(self) -> None:
    """Persists the parsed result and checks the facility ADP sum."""
    # Act: write every parsed table to the database.
    for model, frame in _parsed_result().items():
        dao.write_df(model, frame)

    # Assert: summed average daily populations equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        adp_query = session.query(
            func.sum(CaFacilityAggregate.average_daily_population))
        actual_total = one(one(adp_query.all()))
        self.assertEqual(actual_total, 900124)
def testWriteDf_OverlappingData_WritesNewAndIgnoresDuplicateRows(self):
    """A second write adds only genuinely new rows; duplicates are ignored."""
    # Arrange: seed the table with five counties.
    monthly = enum_strings.monthly_granularity
    initial_df = pd.DataFrame({
        'county_name': ['Alachua', 'Baker', 'Bay', 'Bradford', 'Brevard'],
        'county_population': [257062, 26965, 176016, 27440, 568919],
        'average_daily_population': [799, 478, 1015, 141, 1547],
        'date_reported': [
            pd.NaT,
            pd.NaT,
            datetime.datetime(year=2017, month=9, day=1),
            pd.NaT,
            pd.NaT,
        ],
        'fips': ['00000', '00001', '00002', '00003', '00004'],
        'report_date': 5 * [DATE_SCRAPED],
        'aggregation_window': 5 * [monthly],
        'report_frequency': 5 * [monthly],
    })
    dao.write_df(FlCountyAggregate, initial_df)

    # Overlapping frame: two rows duplicate existing keys (with changed
    # values that must be ignored), one row is genuinely new.
    overlapping_df = pd.DataFrame({
        'county_name': ['Alachua', 'NewCounty', 'Baker'],
        'county_population': [0, 1000000000, 0],
        'average_daily_population': [0, 50, 0],
        'date_reported': [pd.NaT, pd.NaT, pd.NaT],
        'fips': ['00000', '01000', '00002'],
        'report_date': 3 * [DATE_SCRAPED],
        'aggregation_window': 3 * [monthly],
        'report_frequency': 3 * [monthly],
    })

    # Act
    dao.write_df(FlCountyAggregate, overlapping_df)

    # Assert: the sum covers initial_df plus NewCounty only; the changed
    # values in the duplicate rows (e.g. Alachua's 0) are not applied.
    sum_query = SessionFactory.for_schema_base(JailsBase).query(
        func.sum(FlCountyAggregate.county_population))
    actual_total = one(one(sum_query.all()))
    self.assertEqual(actual_total, 1001056402)
def testWrite_CalculatesSum(self) -> None:
    """Persists the parsed PDF tables and checks the total_population sum."""
    # Guard: fail loudly if the fixture never parsed.
    if not self.parsed_pdf:
        raise ValueError("Unexpectedly empty parsed_pdf")

    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: summed facility populations equal the report-wide total.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        sum_query = session.query(
            func.sum(HiFacilityAggregate.total_population))
        actual_total = one(one(sum_query.all()))
        self.assertEqual(actual_total, 5241)
def testWrite_CorrectlyReadsHernandoCounty(self):
    """Spot-checks every column of the Hernando County row."""
    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: the Hernando row carries the values from the source report.
    hernando_query = SessionFactory.for_schema_base(JailsBase) \
        .query(FlCountyAggregate) \
        .filter(FlCountyAggregate.county_name == 'Hernando')
    hernando_row = one(hernando_query.all())

    self.assertEqual(hernando_row.county_name, 'Hernando')
    self.assertEqual(hernando_row.county_population, 179503)
    self.assertEqual(hernando_row.average_daily_population, 632)
    self.assertEqual(hernando_row.date_reported,
                     datetime.date(year=2017, month=9, day=1))
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Downloads a state aggregate report from GCS, parses it, persists the
    parsed tables, and archives the source file to the historical bucket.

    Expects `bucket`, `state`, and `filename` as request query parameters.

    Returns:
        An empty body with HTTPStatus.OK on success.

    Raises:
        StateAggregateError: if a required parameter is missing or the file
            cannot be downloaded.
    """
    # Pull the required parameters off the incoming request.
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    # Build the full GCS path of the file to ingest: <bucket>/<state>/<filename>.
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    # Each state has its own report parser registered in STATE_TO_PARSER;
    # raises KeyError for an unknown state.
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)
    # Log the directory listing to aid debugging when the file is missing.
    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )
    # Providing a stream buffer to tabula reader does not work because it
    # tries to load the file into the local filesystem, since appengine is a
    # read only filesystem (except for the tmpdir) we download the file into
    # the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)
    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    # Persist every parsed table (model class -> DataFrame) to the database.
    for table, df in result.items():
        dao.write_df(table, df)
    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK
def testWriteDf(self):
    """Round-trips a county DataFrame through write_df and checks one row."""
    # Arrange: five county rows, one of which ("Bay") has a real
    # date_reported; the rest are NaT.
    monthly = enum_strings.monthly_granularity
    county_df = pd.DataFrame({
        "county_name": ["Alachua", "Baker", "Bay", "Bradford", "Brevard"],
        "county_population": [257062, 26965, 176016, 27440, 568919],
        "average_daily_population": [799, 478, 1015, 141, 1547],
        "date_reported": [
            pd.NaT,
            pd.NaT,
            datetime.datetime(year=2017, month=9, day=1),
            pd.NaT,
            pd.NaT,
        ],
        "fips": ["00000", "00001", "00002", "00003", "00004"],
        "report_date": 5 * [DATE_SCRAPED],
        "aggregation_window": 5 * [monthly],
        "report_frequency": 5 * [monthly],
    })

    # Act
    dao.write_df(FlCountyAggregate, county_df)

    # Assert: every column of the "Bay" row survived the round trip.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        bay_row = one(session.query(FlCountyAggregate).filter(
            FlCountyAggregate.county_name == "Bay").all())

        self.assertEqual(bay_row.county_name, "Bay")
        self.assertEqual(bay_row.county_population, 176016)
        self.assertEqual(bay_row.average_daily_population, 1015)
        self.assertEqual(bay_row.date_reported,
                         datetime.date(year=2017, month=9, day=1))
        self.assertEqual(bay_row.fips, "00002")
        self.assertEqual(bay_row.report_date, DATE_SCRAPED)
        self.assertEqual(bay_row.aggregation_window, monthly)
def testWriteDf_rowsWithSameColumnsThatMustBeUnique_onlyWritesOnce(self):
    """Rows sharing the unique-constrained columns collapse to one record."""
    # Arrange: two rows deliberately share the same fips (part of the
    # table's uniqueness constraint).
    shared_fips = '12345'
    monthly = enum_strings.monthly_granularity
    duplicate_key_df = pd.DataFrame({
        'county_name': ['Alachua', 'Baker'],
        'county_population': [257062, 26965],
        'average_daily_population': [799, 478],
        'date_reported': [pd.NaT, pd.NaT],
        'fips': 2 * [shared_fips],
        'report_date': 2 * [DATE_SCRAPED],
        'aggregation_window': 2 * [monthly],
        'report_frequency': 2 * [monthly],
    })

    # Act
    dao.write_df(FlCountyAggregate, duplicate_key_df)

    # Assert: only a single row was persisted.
    all_rows_query = \
        SessionFactory.for_schema_base(JailsBase).query(FlCountyAggregate)
    self.assertEqual(len(all_rows_query.all()), 1)
def testWrite_CorrectlyReadsHernandoCounty(self) -> None:
    """Spot-checks every column of the Hernando County row."""
    # Guard: fail loudly if the fixture never parsed.
    if not self.parsed_pdf:
        raise ValueError("Unexpectedly empty parsed_pdf")

    # Act: write every parsed table to the database.
    for model, frame in self.parsed_pdf.items():
        dao.write_df(model, frame)

    # Assert: the Hernando row carries the values from the source report.
    with SessionFactory.using_database(self.database_key,
                                       autocommit=False) as session:
        hernando_row = one(session.query(FlCountyAggregate).filter(
            FlCountyAggregate.county_name == "Hernando").all())

        self.assertEqual(hernando_row.county_name, "Hernando")
        self.assertEqual(hernando_row.county_population, 179503)
        self.assertEqual(hernando_row.average_daily_population, 632)
        self.assertEqual(hernando_row.date_reported,
                         datetime.date(year=2017, month=9, day=1))