def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    Passing ``return_fips=True`` returns a sorted list of the distinct,
    validated FIPS values found in the source CSV without touching the table.

    The script assumes that `starting_date` and `through_date` have been
    set in constants (MortgageDataConstant records).
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    # raw_data is a generator delivering data dicts, each representing a row
    raw_data = read_in_s3_csv(source_url)
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    # Typo fix: log message previously read "CountyMorgtgageData".
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only load rows that fall inside the configured date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Lightweight progress indicators for long-running loads.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    Passing `return_fips=True` will return a sorted list of source FIPS
    values. The script assumes that `starting_date` and `through_date`
    have been set in constants.
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    # raw_data is a generator delivering data dicts, each representing a row
    raw_data = read_in_s3_csv(source_url)
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    # Typo fix: log message previously read "CountyMorgtgageData".
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        if sampling_date >= starting_date and sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Progress markers so long local loads show signs of life.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def create_dump(
        starting_date, through_date, dump_slug, sql=True):
    """
    Dump in-window source rows to a SQL (default) or CSV file.

    Sample input CSV field_names and row:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3

    Default is to dump SQL for mysql loading. Alternative is to dump CSV.
    CSV is portable and less brittle, but our mysql setup doesn't allow it.
    If we switch to Postgres, we can make CSV the default.
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1  # synthetic primary key assigned in row order
    rows_out = []
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Keep only rows inside the [starting_date, through_date] window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county_pk = County.objects.get(fips=valid_fips).pk
                rows_out.append([
                    pk,
                    valid_fips,
                    "{}".format(sampling_date),
                    row.get('open'),
                    row.get('current'),
                    row.get('thirty'),
                    row.get('sixty'),
                    row.get('ninety'),
                    row.get('other'),
                    county_pk])
                pk += 1
                counter += 1
                # Lightweight progress indicators for long dumps.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    if sql is True:
        dump_as_sql(rows_out, dump_slug)
    else:
        dump_as_csv(rows_out, dump_slug)
    # Typo fix: log message previously read "ceate_dump".
    logger.info('\ncreate_dump took {} to create a file with {} rows'.format(
        (datetime.datetime.now() - starter), len(rows_out)))
def test_validate_fips_too_long(self):
    # A six-digit string is longer than any FIPS code and is rejected.
    self.assertEqual(validate_fips('123456'), None)
def test_validate_fips_too_short(self):
    # Two digits is too short for a county FIPS code; expect rejection.
    self.assertEqual(validate_fips('12'), None)
def test_validate_fips_keep_outdated(self):
    # With keep_outdated=True, a normally excluded outdated FIPS code
    # is passed through unchanged.
    result = validate_fips('02201', keep_outdated=True)
    self.assertEqual(result, '02201')
def test_validate_fips_outdated_fips(self):
    # An outdated FIPS code is excluded by default and returns None.
    self.assertIs(validate_fips('02201'), None)
def test_validate_fips_outdated_fips(self):
    # By default an outdated FIPS code ('02201') is filtered out.
    self.assertIs(validate_fips('02201'), None)
def test_validate_fips_invalid_5_digit(self):
    # A five-digit code that is not a currently valid FIPS is rejected.
    self.assertEqual(validate_fips('02201'), None)
def test_validate_fips_edge_case(self):
    # A renumbered FIPS code should be mapped to its replacement code.
    self.assertEqual(validate_fips('46113'), '46102')
def test_validate_fips_too_long(self):
    # Inputs longer than five digits cannot be FIPS codes.
    self.assertEqual(validate_fips('123456'), None)
def test_validate_fips_too_short(self):
    # Inputs shorter than four digits cannot be FIPS codes.
    self.assertEqual(validate_fips('12'), None)
def process_source(
        starting_date, through_date, dump_slug=None):
    """
    Re-generate aggregated data from the latest source CSV posted to S3.

    This operation has the following steps:
    - Wipe and regenerate the base county_mortgage_data table.
    - Regenerate aggregated data for MSAs, non-MSAs, states and national.
    - Update metadata values and files.
    - Export new downloadable public CSV files.

    If dump_slug is provided, a CSV of the base county tables will be
    dumped.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1  # assign primary keys manually so bulk_create rows are ordered
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(
                        pk=pk,
                        fips=valid_fips,
                        date=sampling_date,
                        total=row.get('open'),
                        current=row.get('current'),
                        thirty=row.get('thirty'),
                        sixty=row.get('sixty'),
                        ninety=row.get('ninety'),
                        other=row.get('other'),
                        county=county))
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        rows = []
        for obj in new_objects:
            rows.append([
                obj.pk,
                obj.fips,
                "{}".format(obj.date),
                obj.total,
                obj.current,
                obj.thirty,
                obj.sixty,
                obj.ninety,
                obj.other,
                # Bug fix: previously used the stale loop variable
                # `county.pk` (last county processed) for every row.
                obj.county.pk,
            ])
        dump_as_csv(rows, dump_slug)
def process_source(starting_date, through_date, dump_slug=None):
    """
    Re-generate aggregated data from the latest source CSV posted to S3.

    This operation has the following steps:
    - Wipe and regenerate the base county_mortgage_data table.
    - Regenerate aggregated data for MSAs, non-MSAs, states and national.
    - Update metadata values and files.
    - Export new downloadable public CSV files.

    If dump_slug is provided, a CSV of the base county tables will be
    dumped.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3
    """
    starter = datetime.datetime.now()
    counter = 0
    # Primary keys are assigned manually so rows created via bulk_create
    # get sequential pks in source-row order.
    pk = 1
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only rows inside the [starting_date, through_date] window and
        # with a FIPS value that passes validation are loaded.
        if sampling_date >= starting_date and sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(pk=pk,
                                       fips=valid_fips,
                                       date=sampling_date,
                                       total=row.get('open'),
                                       current=row.get('current'),
                                       thirty=row.get('thirty'),
                                       sixty=row.get('sixty'),
                                       ninety=row.get('ninety'),
                                       other=row.get('other'),
                                       county=county))
                pk += 1
                counter += 1
                # Progress indicators so long runs show signs of life.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    # Single bulk insert of all accepted rows.
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        # Stream rows to the CSV dumper via a generator expression;
        # nothing extra is materialized in memory.
        dump_as_csv(((
            obj.pk,
            obj.fips,
            "{}".format(obj.date),
            obj.total,
            obj.current,
            obj.thirty,
            obj.sixty,
            obj.ninety,
            obj.other,
            obj.county.pk,
        ) for obj in new_objects), dump_slug)
def test_validate_fips_edge_case(self):
    # The old code 46113 should be translated to its successor, 46102.
    self.assertEqual(validate_fips('46113'), '46102')
def test_validate_fips_4_digit(self):
    # A four-digit code gets its leading zero restored.
    self.assertEqual(validate_fips('1015'), '01015')
def test_validate_fips_4_digit(self):
    # Four-digit input is zero-padded to a five-digit FIPS code.
    self.assertEqual(validate_fips('1015'), '01015')
def test_validate_fips_valid_5_digit(self):
    # A valid five-digit FIPS code is returned unchanged.
    self.assertEqual(validate_fips('34041'), '34041')
def test_validate_fips_invalid_5_digit(self):
    # Five digits alone are not enough; the code must be currently valid.
    self.assertEqual(validate_fips('02201'), None)
def test_validate_fips_keep_outdated(self):
    # keep_outdated=True lets a normally excluded outdated code through.
    self.assertEqual(
        validate_fips('02201', keep_outdated=True), '02201')
def test_validate_fips_valid_5_digit(self):
    # An already-valid five-digit code passes through untouched.
    self.assertEqual(validate_fips('34041'), '34041')