def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    Passing `return_fips=True` returns a sorted, de-duplicated list of
    source FIPS values without touching the table. The script assumes that
    `starting_date` and `through_date` have been set in constants.
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    # Fixed log-message typo: "CountyMorgtgageData" -> "CountyMortgageData"
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only load rows that fall inside the configured date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Lightweight progress indicators for long local runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    Passing `return_fips=True` returns a sorted, de-duplicated list of
    source FIPS values without touching the table. The script assumes that
    `starting_date` and `through_date` have been set in constants.
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    # Fixed log-message typo: "CountyMorgtgageData" -> "CountyMortgageData"
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only load rows that fall inside the configured date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Lightweight progress indicators for long local runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def create_dump(
        starting_date, through_date, dump_slug, sql=True):
    """
    Dump source rows inside a date window as SQL (default) or CSV.

    Sample input CSV field_names and row:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3

    Default is to dump SQL for mysql loading. Alternative is to dump CSV.
    CSV is portable and less brittle, but our mysql setup doesn't allow it.
    If we switch to Postgres, we can make CSV the default.
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    rows_out = []
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only keep rows that fall inside the requested date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county_pk = County.objects.get(fips=valid_fips).pk
                rows_out.append([
                    pk,
                    valid_fips,
                    "{}".format(sampling_date),
                    row.get('open'),
                    row.get('current'),
                    row.get('thirty'),
                    row.get('sixty'),
                    row.get('ninety'),
                    row.get('other'),
                    county_pk])
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    if sql is True:
        dump_as_sql(rows_out, dump_slug)
    else:
        dump_as_csv(rows_out, dump_slug)
    # Fixed log-message typo: 'ceate_dump' -> 'create_dump'
    logger.info('\ncreate_dump took {} to create a file with {} rows'.format(
        (datetime.datetime.now() - starter), len(rows_out)))
def load_values(s3_filename, starting_date, return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    Passing `return_fips=True` returns a sorted, de-duplicated list of
    source FIPS values WITHOUT modifying the table. (Bug fix: the
    `return_fips` early return now happens before the table is wiped;
    previously a read-only FIPS query also deleted all existing rows.)
    Also fixed log-message typo "CountyMorgtgageData".
    """
    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, s3_filename)
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        fips_list = [validate_fips(row.get('fips')) for row in raw_data]
        return sorted(set(fips_list))
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    logger.info("CountyMortgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        valid_fips = validate_fips(row.get('fips'))
        sampling_date = parser.parse(row.get('date')).date()
        # Skip rows with invalid FIPS or sampled before the start date.
        if valid_fips and sampling_date >= starting_date:
            county = County.objects.get(fips=valid_fips)
            obj = CountyMortgageData(
                fips=valid_fips,
                county=county,
                date=sampling_date,
                total=int(row.get('open')),
                current=int(row.get('current')),
                thirty=int(row.get('thirty')),
                sixty=int(row.get('sixty')),
                ninety=int(row.get('ninety')),
                other=int(row.get('other')))
            obj.save()
            counter += 1
            # Lightweight progress indicators for long runs.
            if counter % 10000 == 0:  # pragma: no cover
                sys.stdout.write('.')
                sys.stdout.flush()
            if counter % 100000 == 0:  # pragma: no cover
                logger.info("\n{}".format(counter))
    merge_the_dades()
    update_sampling_dates()
    logger.info("Created {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def process_source(
        starting_date, through_date, dump_slug=None):
    """
    Re-generate aggregated data from the latest source CSV posted to S3.

    This operation has three steps
    - Wipe and regenerate the base county_mortgage_data table.
    - Regenerate aggregated data for MSAs, non-MSAs, states and national.
    - Update metadata values and files.
    - Export new downloadable public CSV files.

    If dump_slug is provided, a CSV of the base county tables will be dumped.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Only keep rows that fall inside the requested date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(
                        pk=pk,
                        fips=valid_fips,
                        date=sampling_date,
                        total=row.get('open'),
                        current=row.get('current'),
                        thirty=row.get('thirty'),
                        sixty=row.get('sixty'),
                        ninety=row.get('ninety'),
                        other=row.get('other'),
                        county=county
                    ))
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        rows = []
        for obj in new_objects:
            rows.append([
                obj.pk,
                obj.fips,
                "{}".format(obj.date),
                obj.total,
                obj.current,
                obj.thirty,
                obj.sixty,
                obj.ninety,
                obj.other,
                # BUG FIX: was `county.pk` — the leaked loop variable from the
                # load loop above, which stamped every dumped row with the
                # LAST county processed. Each row must use its own county.
                obj.county.pk
            ])
        dump_as_csv(rows, dump_slug)
def test_read_in_s3_csv(self, mock_requests):
    """read_in_s3_csv should fetch the URL once and yield parsed dict rows."""
    mock_requests.return_value.content = 'a,b,c\nd,e,f'
    reader = read_in_s3_csv('fake-s3-url.com')
    self.assertEqual(mock_requests.call_count, 1)
    self.assertEqual(reader.fieldnames, ['a', 'b', 'c'])
    # Fixed Py3 incompatibility: reader.next() is Python-2-only;
    # the next() builtin works on both.
    self.assertEqual(sorted(next(reader).values()), ['d', 'e', 'f'])
def process_source(starting_date, through_date, dump_slug=None):
    """
    Re-generate aggregated data from the latest source CSV posted to S3.

    This operation has three steps
    - Wipe and regenerate the base county_mortgage_data table.
    - Regenerate aggregated data for MSAs, non-MSAs, states and national.
    - Update metadata values and files.
    - Export new downloadable public CSV files.

    If dump_slug is provided, a CSV of the base county tables will be dumped.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3
    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        # Chained comparison: only keep rows inside the date window.
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(pk=pk,
                                       fips=valid_fips,
                                       date=sampling_date,
                                       total=row.get('open'),
                                       current=row.get('current'),
                                       thirty=row.get('thirty'),
                                       sixty=row.get('sixty'),
                                       ninety=row.get('ninety'),
                                       other=row.get('other'),
                                       county=county))
                pk += 1
                counter += 1
                # Lightweight progress indicators for long runs.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        # Stream rows as a generator; dump_as_csv consumes it lazily.
        dump_as_csv(((
            obj.pk,
            obj.fips,
            "{}".format(obj.date),
            obj.total,
            obj.current,
            obj.thirty,
            obj.sixty,
            obj.ninety,
            obj.other,
            obj.county.pk,
        ) for obj in new_objects), dump_slug)
def test_read_in_s3_csv(self):
    """A CSV body served at the URL comes back as a dict-row reader."""
    csv_url = 'https://test.url/foo.csv'
    responses.add(responses.GET, csv_url, body='a,b,c\nd,e,f')
    csv_reader = read_in_s3_csv(csv_url)
    header = csv_reader.fieldnames
    first_row = next(csv_reader)
    self.assertEqual(header, ['a', 'b', 'c'])
    self.assertEqual(sorted(first_row.values()), ['d', 'e', 'f'])
def test_read_in_s3_csv(self):
    """read_in_s3_csv should parse a fetched CSV into header + dict rows."""
    url = 'https://test.url/foo.csv'
    responses.add(responses.GET, url, body='a,b,c\nd,e,f')
    reader = read_in_s3_csv(url)
    self.assertEqual(reader.fieldnames, ['a', 'b', 'c'])
    # Fixed Py3 incompatibility: reader.next() is Python-2-only;
    # the next() builtin works on both.
    self.assertEqual(sorted(next(reader).values()), ['d', 'e', 'f'])
def test_read_in_s3_csv(self, mock_requests):
    """read_in_s3_csv should fetch the URL once and yield parsed dict rows."""
    mock_requests.return_value.content = 'a,b,c\nd,e,f'
    reader = read_in_s3_csv('fake-s3-url.com')
    self.assertEqual(mock_requests.call_count, 1)
    self.assertEqual(reader.fieldnames, ['a', 'b', 'c'])
    # Fixed Py3 incompatibility: reader.next() is Python-2-only;
    # the next() builtin works on both.
    self.assertEqual(sorted(next(reader).values()), ['d', 'e', 'f'])