def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    Passing `return_fips=True` returns a sorted, de-duplicated list of
    source FIPS values without touching the table. The script assumes that
    `starting_date` and `through_date` have been set in constants.
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    # Fixed log-message typo: "CountyMorgtgageData" -> "CountyMortgageData"
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only load rows that fall inside the configured date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Lightweight progress indicators for long local runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    Passing `return_fips=True` returns a sorted, de-duplicated list of
    source FIPS values without touching the table. The script assumes that
    `starting_date` and `through_date` have been set in constants.
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    # Fixed log-message typo: "CountyMorgtgageData" -> "CountyMortgageData"
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only load rows that fall inside the configured date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Lightweight progress indicators for long local runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def create_dump(
        starting_date, through_date, dump_slug, sql=True):
    """
    Dump source rows inside a date window as SQL (default) or CSV.

    Sample input CSV field_names and row:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3

    Default is to dump SQL for mysql loading. Alternative is to dump CSV.
    CSV is portable and less brittle, but our mysql setup doesn't allow it.
    If we switch to Postgres, we can make CSV the default.
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    rows_out = []
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only keep rows that fall inside the requested date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county_pk = County.objects.get(fips=valid_fips).pk
                rows_out.append([
                    pk,
                    valid_fips,
                    "{}".format(sampling_date),
                    row.get('open'),
                    row.get('current'),
                    row.get('thirty'),
                    row.get('sixty'),
                    row.get('ninety'),
                    row.get('other'),
                    county_pk])
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    if sql is True:
        dump_as_sql(rows_out, dump_slug)
    else:
        dump_as_csv(rows_out, dump_slug)
    # Fixed log-message typo: 'ceate_dump' -> 'create_dump'
    logger.info('\ncreate_dump took {} to create a file with {} rows'.format(
        (datetime.datetime.now() - starter), len(rows_out)))
def load_values(s3_filename, starting_date, return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    Passing `return_fips=True` returns a sorted, de-duplicated list of
    source FIPS values WITHOUT modifying the table. (Bug fix: the
    `return_fips` early return now happens before the table is wiped;
    previously a read-only FIPS query also deleted all existing rows.)
    Also fixed log-message typo "CountyMorgtgageData".
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, s3_filename)
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        valid_fips = validate_fips(row.get('fips'))
        sampling_date = parser.parse(row.get('date')).date()
        # Skip rows with invalid FIPS or sampled before the start date.
        if valid_fips and sampling_date >= starting_date:
            county = County.objects.get(fips=valid_fips)
            obj = CountyMortgageData(
                fips=valid_fips,
                county=county,
                date=sampling_date,
                total=int(row.get('open')),
                current=int(row.get('current')),
                thirty=int(row.get('thirty')),
                sixty=int(row.get('sixty')),
                ninety=int(row.get('ninety')),
                other=int(row.get('other')))
            obj.save()
            counter += 1
            # Lightweight progress indicators for long runs.
            if counter % 10000 == 0:  # pragma: no cover
                sys.stdout.write('.')
                sys.stdout.flush()
            if counter % 100000 == 0:  # pragma: no cover
                logger.info("\n{}".format(counter))
    merge_the_dades()
    update_sampling_dates()
    logger.info("Created {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def process_source(
        starting_date, through_date, dump_slug=None):
    """
    Re-generate aggregated data from the latest source CSV posted to S3.

    This operation has three steps
    - Wipe and regenerate the base county_mortgage_data table.
    - Regenerate aggregated data for MSAs, non-MSAs, states and national.
    - Update metadata values and files.
    - Export new downloadable public CSV files.

    If dump_slug is provided, a CSV of the base county tables will be dumped.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only keep rows that fall inside the requested date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(
                        pk=pk,
                        fips=valid_fips,
                        date=sampling_date,
                        total=row.get('open'),
                        current=row.get('current'),
                        thirty=row.get('thirty'),
                        sixty=row.get('sixty'),
                        ninety=row.get('ninety'),
                        other=row.get('other'),
                        county=county
                    ))
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        rows = []
        for obj in new_objects:
            rows.append([
                obj.pk,
                obj.fips,
                "{}".format(obj.date),
                obj.total,
                obj.current,
                obj.thirty,
                obj.sixty,
                obj.ninety,
                obj.other,
                # BUG FIX: was `county.pk` — the leaked loop variable from the
                # load loop above, which stamped every dumped row with the
                # LAST county processed. Each row must use its own county.
                obj.county.pk
            ])
        dump_as_csv(rows, dump_slug)
def test_read_in_s3_csv(self, mock_requests):
    """read_in_s3_csv should fetch the URL once and yield parsed dict rows."""
    mock_requests.return_value.content = 'a,b,c\nd,e,f'
    reader = read_in_s3_csv('fake-s3-url.com')
    self.assertEqual(mock_requests.call_count, 1)
    self.assertEqual(reader.fieldnames, ['a', 'b', 'c'])
    # Fixed Py3 incompatibility: reader.next() is Python-2-only;
    # the next() builtin works on both.
    self.assertEqual(sorted(next(reader).values()), ['d', 'e', 'f'])
def process_source(starting_date, through_date, dump_slug=None):
    """
    Re-generate aggregated data from the latest source CSV posted to S3.

    This operation has three steps
    - Wipe and regenerate the base county_mortgage_data table.
    - Regenerate aggregated data for MSAs, non-MSAs, states and national.
    - Update metadata values and files.
    - Export new downloadable public CSV files.

    If dump_slug is provided, a CSV of the base county tables will be dumped.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Chained comparison: only keep rows inside the date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(pk=pk,
                                       fips=valid_fips,
                                       date=sampling_date,
                                       total=row.get('open'),
                                       current=row.get('current'),
                                       thirty=row.get('thirty'),
                                       sixty=row.get('sixty'),
                                       ninety=row.get('ninety'),
                                       other=row.get('other'),
                                       county=county))
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        # Stream rows as a generator; dump_as_csv consumes it lazily.
        dump_as_csv(((
            obj.pk,
            obj.fips,
            "{}".format(obj.date),
            obj.total,
            obj.current,
            obj.thirty,
            obj.sixty,
            obj.ninety,
            obj.other,
            obj.county.pk,
        ) for obj in new_objects), dump_slug)
def test_read_in_s3_csv(self):
    """A CSV body served at the URL comes back as a dict-row reader."""
    csv_url = 'https://test.url/foo.csv'
    responses.add(responses.GET, csv_url, body='a,b,c\nd,e,f')
    csv_reader = read_in_s3_csv(csv_url)
    header = csv_reader.fieldnames
    first_row = next(csv_reader)
    self.assertEqual(header, ['a', 'b', 'c'])
    self.assertEqual(sorted(first_row.values()), ['d', 'e', 'f'])
def test_read_in_s3_csv(self):
    """read_in_s3_csv should parse a fetched CSV into header + dict rows."""
    url = 'https://test.url/foo.csv'
    responses.add(responses.GET, url, body='a,b,c\nd,e,f')
    reader = read_in_s3_csv(url)
    self.assertEqual(reader.fieldnames, ['a', 'b', 'c'])
    # Fixed Py3 incompatibility: reader.next() is Python-2-only;
    # the next() builtin works on both.
    self.assertEqual(sorted(next(reader).values()), ['d', 'e', 'f'])
def test_read_in_s3_csv(self, mock_requests):
    """read_in_s3_csv should fetch the URL once and yield parsed dict rows."""
    mock_requests.return_value.content = 'a,b,c\nd,e,f'
    reader = read_in_s3_csv('fake-s3-url.com')
    self.assertEqual(mock_requests.call_count, 1)
    self.assertEqual(reader.fieldnames, ['a', 'b', 'c'])
    # Fixed Py3 incompatibility: reader.next() is Python-2-only;
    # the next() builtin works on both.
    self.assertEqual(sorted(next(reader).values()), ['d', 'e', 'f'])