def load_object_class(base_path):
    """ This function loads Object classes into the database by fully replacing the object_class table.

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "object_class.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: drop all existing rows before re-inserting
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info("Adding congressional districts from census to the state_congressional table")

    census_df = pd.read_csv(census_file, dtype=str)
    census_df = clean_data(
        census_df,
        StateCongressional,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {"congressional_district_no": {"pad_to_length": 2}}
    )

    # append the cleaned census rows and persist them
    insert_dataframe(census_df, StateCongressional.__table__.name, sess.connection())
    sess.commit()
def load_country_codes(base_path):
    """ Load Country Codes into the database, fully replacing the country_code table
        and writing a metrics JSON file alongside.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    # metrics are accumulated during the load and dumped to a JSON file at the end
    metrics_json = {
        'script_name': 'load_country_codes.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_provided': 0,
        'duplicates_dropped': 0,
        'records_inserted': 0
    }

    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object',
            {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "country_codes.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: delete and replace all existing country codes
        metrics_json['records_deleted'] = sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        metrics_json['records_provided'] = len(data.index)
        data = clean_data(data, CountryCode, {
            "country_code": "country_code",
            "country_name": "country_name"
        }, {})
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_provided'] - len(data.index)

        # flag territories or freely associated states
        data["territory_free_state"] = np.where(data["country_code"].isin(TERRITORIES_FREE_STATES), True, False)

        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        metrics_json['records_inserted'] = num
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_country_codes_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info("Script complete")
def clean_sam_data(data):
    """ Wrapper around clean_data with the DUNS context

        Args:
            data: the dataframe to be cleaned

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # nothing to clean on an empty frame
    if data.empty:
        return data

    # every source column keeps its own name, so build the identity mapping from one column list
    columns = (
        "awardee_or_recipient_uniqu", "activation_date", "deactivation_date", "registration_date",
        "expiration_date", "last_sam_mod_date", "legal_business_name", "dba_name", "address_line_1",
        "address_line_2", "city", "state", "zip", "zip4", "country_code", "congressional_district",
        "entity_structure", "business_types_codes", "business_types", "ultimate_parent_legal_enti",
        "ultimate_parent_unique_ide"
    )
    return clean_data(data, DUNS, {name: name for name in columns}, {})
def load_frec(file_name):
    """ Load FREC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    # existing FREC rows keyed by code; updated/deleted in place by the helpers below
    models = {frec.frec_code: frec for frec in sess.query(FREC)}

    # read FREC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data, FREC,
        {"frec": "frec_code", "cgac_agency_code": "cgac_code", "frec_entity_description": "agency_name",
         "agency_abbreviation": "agency_abbreviation"},
        {"frec": {"keep_null": False}, "cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4}}
    )
    # de-dupe
    data.drop_duplicates(subset=['frec_code'], inplace=True)
    # create foreign key dicts (cgac_code -> cgac_id for codes present in the incoming data)
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                 sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}
    # insert to db
    delete_missing_frecs(models, data)
    update_frecs(models, data, cgac_dict)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s FREC records inserted', len(models))
def load_quarterly_threshold():
    """ Loads the quarterly revalidation threshold data, fully replacing the
        quarterly_revalidation_threshold table.
    """
    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        threshold_file = s3_client.generate_presigned_url('get_object',
                                                          {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                           'Key': "quarterly_submission_dates.csv"},
                                                          ExpiresIn=600)
    else:
        threshold_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                      "quarterly_submission_dates.csv")

    logger.info('Loading quarterly revalidation threshold data')
    with create_app().app_context():
        data = pd.read_csv(threshold_file, dtype=str)

        data = clean_data(
            data,
            QuarterlyRevalidationThreshold,
            {"year": "year", "quarter": "quarter", "window_start": "window_start", "window_end": "window_end"},
            {}
        )

        sess = GlobalDB.db().session
        # delete any data in the QuarterlyRevalidationThreshold table
        sess.query(QuarterlyRevalidationThreshold).delete()

        # insert data into table
        num = insert_dataframe(data, QuarterlyRevalidationThreshold.__table__.name, sess.connection())
        logger.info('{} records inserted to quarterly_revalidation_threshold'.format(num))
        sess.commit()
def load_cgac(file_name):
    """ Load CGAC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    # existing CGAC rows keyed by code; updated/deleted in place by the helpers below
    models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

    # read CGAC values from csv
    data = pd.read_csv(file_name, dtype=str)
    # clean data
    data = clean_data(
        data, CGAC,
        {"cgac_agency_code": "cgac_code", "agency_name": "agency_name",
         "agency_abbreviation": "agency_abbreviation"},
        {"cgac_code": {"pad_to_length": 3}}
    )
    # de-dupe
    data.drop_duplicates(subset=['cgac_code'], inplace=True)

    delete_missing_cgacs(models, data)
    update_cgacs(models, data)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s CGAC records inserted', len(models))
def clean_sam_data(data, table=DUNS):
    """ Wrapper around clean_data with the DUNS context

        Args:
            data: the dataframe to be cleaned
            table: the table to work from (could be DUNS/HistoricParentDuns)

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # every source column keeps its own name, so build the identity mapping from a single column list
    sam_columns = (
        "awardee_or_recipient_uniqu", "activation_date", "deactivation_date", "registration_date",
        "expiration_date", "last_sam_mod_date", "sam_extract_code", "legal_business_name", "dba_name",
        "address_line_1", "address_line_2", "city", "state", "zip", "zip4", "country_code",
        "congressional_district", "entity_structure", "business_types_codes",
        "ultimate_parent_legal_enti", "ultimate_parent_unique_ide"
    )
    return clean_data(data, table, {column: column for column in sam_columns}, {})
def clean_office(csv_path):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results.

        Args:
            csv_path: path/url to the file to read from
    """
    # all office fields map to themselves, so the mapping is an identity over this list
    office_columns = ["department_id", "department_name", "agency_code", "agency_name",
                      "contracting_office_code", "contracting_office_name", "start_date", "end_date",
                      "address_city", "address_state", "zip_code", "country_code"]
    office_data = clean_data(
        pd.read_csv(csv_path, dtype=str),
        FPDSContractingOffice,
        {column: column for column in office_columns},
        {"department_id": {"pad_to_length": 4}}
    )
    # swap pandas NaN for None so downstream consumers see real NULLs
    return office_data.where(pd.notnull(office_data), None)
def clean_office(csv_path):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results.

        Args:
            csv_path: path/url to the file to read from
    """
    raw = pd.read_csv(csv_path, dtype=str)
    # source and target column names are identical, so zip the list against itself
    field_names = ("department_id", "department_name", "agency_code", "agency_name",
                   "contracting_office_code", "contracting_office_name", "start_date", "end_date",
                   "address_city", "address_state", "zip_code", "country_code")
    cleaned = clean_data(raw, FPDSContractingOffice, dict(zip(field_names, field_names)),
                         {"department_id": {"pad_to_length": 4}})
    # replace pandas NaN with None so the database layer receives NULLs
    return cleaned.where(pd.notnull(cleaned), None)
def clean_sam_data(data, table=DUNS):
    """ Wrapper around clean_data with the DUNS context

        Args:
            data: the dataframe to be cleaned
            table: the table to work from (could be DUNS/HistoricParentDuns)

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # all fields carry over under their original names; spell the mapping out from one flat list
    fields = [
        "awardee_or_recipient_uniqu", "activation_date", "deactivation_date", "registration_date",
        "expiration_date", "last_sam_mod_date", "sam_extract_code", "legal_business_name", "dba_name",
        "address_line_1", "address_line_2", "city", "state", "zip", "zip4", "country_code",
        "congressional_district", "business_types_codes", "ultimate_parent_legal_enti",
        "ultimate_parent_unique_ide",
    ]
    identity_mapping = dict(zip(fields, fields))
    return clean_data(data, table, identity_mapping, {})
def clean_sf133_data(filename, sf133_data):
    """ Clean up the data read in from the file.

        Args:
            filename: name/path of the file to be read
            sf133_data: Data model to clean against

        Returns:
            cleaned dataframe with derived `tas`, `display_tas`, and numeric `amount` columns,
            after dropping known duplicate report lines and filling in missing 0-value lines
    """
    data = pd.read_csv(filename, dtype=str)
    data = clean_data(
        data,
        sf133_data,
        {"ata": "allocation_transfer_agency",
         "aid": "agency_identifier",
         "availability_type_code": "availability_type_code",
         "bpoa": "beginning_period_of_availa",
         "epoa": "ending_period_of_availabil",
         "main_account": "main_account_code",
         "sub_account": "sub_account_code",
         "fiscal_year": "fiscal_year",
         "period": "period",
         "line_num": "line",
         "amount_summed": "amount",
         "defc": "disaster_emergency_fund_code"},
        {"allocation_transfer_agency": {"pad_to_length": 3},
         "agency_identifier": {"pad_to_length": 3},
         "main_account_code": {"pad_to_length": 4},
         "sub_account_code": {"pad_to_length": 3},
         # next 3 lines handle the TAS fields that shouldn't be padded but should still be empty spaces rather than
         # NULLs. this ensures that the downstream pivot & melt (which insert the missing 0-value SF-133 lines) will
         # work as expected (values used in the pivot index cannot be NULL). the "pad_to_length: 0" works around the
         # fact that sometimes the incoming data for these columns is a single space and sometimes it is blank/NULL.
         "beginning_period_of_availa": {"pad_to_length": 0},
         "ending_period_of_availabil": {"pad_to_length": 0},
         "availability_type_code": {"pad_to_length": 0},
         "amount": {"strip_commas": True}}
    )

    # todo: find out how to handle dup rows (e.g., same tas/period/line number)
    # line numbers 2002 and 2012 are the only duped SF 133 report line numbers, and they are not used by the validation
    # rules, so for now just remove them before loading our SF-133 table
    # NOTE(review): the comment above says 2012 but the list below contains '2102' — confirm which is intended
    dupe_line_numbers = ['2002', '2102']
    data = data[~data.line.isin(dupe_line_numbers)]

    # Uppercasing DEFC to save on indexing
    # Empty values are still empty strings ('') at this point
    data['disaster_emergency_fund_code'] = data['disaster_emergency_fund_code'].str.upper()
    # collapse the 'QQQ' placeholder to 'Q' and turn empty strings into None
    data['disaster_emergency_fund_code'] = data['disaster_emergency_fund_code']. \
        apply(lambda x: x.replace('QQQ', 'Q') if x else None)

    # add concatenated TAS field for internal use (i.e., joining to staging tables)
    data['tas'] = data.apply(lambda row: format_internal_tas(row), axis=1)
    data['display_tas'] = ''
    data['amount'] = data['amount'].astype(float)

    data = fill_blank_sf133_lines(data)

    return data
def load_object_class(base_path):
    """ This function loads Object classes into the database, fully replacing the
        object_class table and writing a metrics JSON file alongside.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    # metrics are accumulated during the load and dumped to a JSON file at the end
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object',
            {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "object_class.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: delete existing rows, recording how many were removed
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {
                "max_oc_code": "object_class_code",
                "max_object_class_name": "object_class_name"
            },
            {"object_class_code": {
                "pad_to_length": 3
            }})
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_frec(file_name, force_reload=False):
    """ Load FREC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
            force_reload: whether to reload regardless

        Returns:
            True if new data was loaded, False if the load was skipped
    """
    sess = GlobalDB.db().session
    # existing FREC rows keyed by code; updated/deleted in place by the helpers below
    models = {frec.frec_code: frec for frec in sess.query(FREC)}

    # read FREC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data, FREC,
        {'frec': 'frec_code', 'cgac_agency_code': 'cgac_code', 'frec_entity_description': 'agency_name',
         'frec_abbreviation': 'frec_abbreviation', 'frec_cgac_association': 'frec_cgac',
         'icon_filename': 'icon_name'},
        {'frec': {'keep_null': False}, 'cgac_code': {'pad_to_length': 3}, 'frec_code': {'pad_to_length': 4}}
    )
    # NOTE(review): args=False relies on pandas treating a falsy `args` as "no extra positional args";
    # args=() would express that intent directly — confirm before changing.
    data['icon_name'] = data['icon_name'].apply(clean_col, args=False)
    # de-dupe: keep only rows flagged as the FREC/CGAC association, then drop the flag column
    data = data[data.frec_cgac == 'TRUE']
    data.drop(['frec_cgac'], axis=1, inplace=True)
    data.drop_duplicates(subset=['frec_code'], inplace=True)
    # create foreign key dicts (cgac_code <-> cgac_id, both directions)
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                 sess.query(CGAC).filter(CGAC.cgac_code.in_(data['cgac_code'])).all()}
    cgac_dict_flipped = {cgac_id: cgac_code for cgac_code, cgac_id in cgac_dict.items()}

    # compare to existing content in table
    def extract_cgac(row):
        # map a stored cgac_id back to its code for the dataframe diff; NaN when the id is unknown
        return cgac_dict_flipped[row['cgac_id']] if row['cgac_id'] in cgac_dict_flipped else np.nan
    diff_found = check_dataframe_diff(data, FREC, ['frec_id', 'cgac_id'], ['frec_code'],
                                      lambda_funcs=[('frec_abbreviation', extract_abbreviation),
                                                    ('agency_name', extract_name),
                                                    ('cgac_code', extract_cgac)])

    if force_reload or diff_found:
        # insert to db
        delete_missing_frecs(models, data)
        update_frecs(models, data, cgac_dict)
        sess.add_all(models.values())
        sess.commit()
        logger.info('%s FREC records inserted', len(models))
        return True
    else:
        logger.info('No differences found, skipping frec table reload.')
        return False
def load_object_class(base_path):
    """ This function loads Object classes into the database, refreshing the local CSV
        from the public reference URL when possible, fully replacing the object_class
        table, and writing a metrics JSON file alongside.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    # metrics are accumulated during the load and dumped to a JSON file at the end
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    filename = os.path.join(base_path, 'object_class.csv')
    try:
        # Update file from public S3 bucket
        object_class_url = '{}/object_class.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(object_class_url, allow_redirects=True)
        # don't overwrite the local copy with an HTTP error page
        r.raise_for_status()
        # context manager closes the handle deterministically (was previously leaked)
        with open(filename, 'wb') as object_class_csv:
            object_class_csv.write(r.content)
    except Exception:
        # best-effort refresh: fall back to the existing local copy, but leave a trace for operators
        logger.exception('Could not refresh object_class.csv from the public reference URL; using local copy')

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: delete existing rows, recording how many were removed
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num
    update_external_data_load_date(now, datetime.datetime.now(), 'object_class')

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def clean_duns_csv_data(data):
    """ Simple wrapper around clean_data applied just for duns

        Args:
            data: dataframe representing the data to be cleaned

        Returns:
            a dataframe cleaned and to be imported to the database
    """
    # delegate to the shared cleaner with the module-level DUNS column mappings and no extra options
    return clean_data(data, DUNS, column_mappings, {})
def run_duns_batches(file, sess, client, block_size=10000):
    """ Updates DUNS table in chunks from csv file

        Args:
            file: path to the DUNS export file to use
            sess: the database connection
            client: the connection to the SAM service
            block_size: the size of the batches to read from the DUNS export file.
    """
    logger.info("Retrieving total rows from duns file")
    start = datetime.now()
    duns_reader_obj = pd.read_csv(file, skipinitialspace=True, header=None, quotechar='"', dtype=str,
                                  names=column_headers, iterator=True, chunksize=block_size, skiprows=1)
    # NOTE: materializing every chunk up front means the whole file is held in memory;
    # the iterator only bounds the size of each individual dataframe
    duns_dfs = [duns_df for duns_df in duns_reader_obj]
    row_count = sum([len(duns_df.index) for duns_df in duns_dfs])
    logger.info("Retrieved row count of {} in {} s".format(row_count, (datetime.now() - start).total_seconds()))

    duns_added = 0
    for duns_df in duns_dfs:
        # Remove rows where awardee_or_recipient_uniqu is null
        duns_df = duns_df[duns_df['awardee_or_recipient_uniqu'].notnull()]
        # Ignore old DUNS we already have
        duns_to_load = remove_existing_duns(duns_df, sess)

        if not duns_to_load.empty:
            logger.info("Adding {} DUNS records from historic data".format(len(duns_to_load.index)))
            start = datetime.now()

            # get address info for incoming duns
            duns_to_load = update_duns_props(duns_to_load, client)
            duns_to_load = clean_data(duns_to_load, HistoricDUNS, column_mappings, {})
            duns_added += len(duns_to_load.index)

            # commit per batch so progress survives a mid-run failure
            insert_dataframe(duns_to_load, HistoricDUNS.__table__.name, sess.connection())
            sess.commit()

            logger.info("Finished updating {} DUNS rows in {} s".format(len(duns_to_load.index),
                                                                        (datetime.now() - start).total_seconds()))

    logger.info("Imported {} historical duns".format(duns_added))
def clean_sam_data(data):
    """ Wrapper around clean_data with the SAM Recipient context

        Args:
            data: the dataframe to be cleaned

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # an empty frame needs no cleaning
    if data.empty:
        return data

    # pass every incoming column through unchanged
    identity_map = {col: col for col in data.columns}
    return clean_data(data, SAMRecipient, identity_map, {})
def format_fabs_data(data):
    """ Clean and format historical FABS data for loading.

        Args:
            data: raw dataframe read from the source file (mutated in place by the initial filtering)

        Returns:
            a cleaned dataframe with an afa_generated_unique column added and the key columns removed,
            or None if no rows survive the initial filter
    """
    # drop all records without any data to be loaded
    # BUG FIX: the original code did `data = data.replace('', np.nan, inplace=True)`; with inplace=True
    # the call returns None, so every subsequent statement crashed. Mutate in place without reassigning.
    data.replace('', np.nan, inplace=True)
    data.dropna(subset=["awarding office code", "awarding office name", "funding office name", "funding office code",
                        "funding agency name", "funding agency code", "funding sub tier agency code",
                        "funding sub tier agency name", "legal entity foreign city", "legal entity foreign province",
                        "legal entity foreign postal code", "legal entity foreign location description"],
                inplace=True)

    # ensure there are rows to be cleaned and formatted
    if len(data.index) == 0:
        return None

    cdata = clean_data(
        data,
        PublishedAwardFinancialAssistance,
        {
            "agency_code": "awarding_sub_tier_agency_c",
            "federal_award_mod": "award_modification_amendme",
            "federal_award_id": "fain",
            "uri": "uri",
            "awarding office code": "awarding_office_code",
            "awarding office name": "awarding_office_name",
            "funding office name": "funding_office_name",
            "funding office code": "funding_office_code",
            "funding agency name": "funding_agency_name",
            "funding agency code": "funding_agency_code",
            "funding sub tier agency code": "funding_sub_tier_agency_co",
            "funding sub tier agency name": "funding_sub_tier_agency_na",
            "legal entity foreign city": "legal_entity_foreign_city",
            "legal entity foreign province": "legal_entity_foreign_provi",
            "legal entity foreign postal code": "legal_entity_foreign_posta",
            "legal entity foreign location description": "legal_entity_foreign_descr"
        }, {}
    )

    # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented as
    # NULL in the db.
    cdata = cdata.replace(np.nan, '', regex=True)
    cdata = cdata.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

    # generate the afa_generated_unique field
    cdata['afa_generated_unique'] = cdata.apply(lambda x: generate_unique_string(x), axis=1)

    # drop columns in afa_generated_unique because we aren't updating them
    for col in ["awarding_sub_tier_agency_c", "award_modification_amendme", "fain", "uri"]:
        del cdata[col]

    return cdata
def load_defc(force_reload=False):
    """ Loads the DEFC data, refreshing the local CSV from the public reference URL when possible
        and replacing the defc table only when the data differs (or force_reload is set).

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    defc_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config', 'def_codes.csv')
    try:
        # Update file from public S3 bucket
        def_codes_url = '{}/def_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(def_codes_url, allow_redirects=True)
        # don't overwrite the local copy with an HTTP error page
        r.raise_for_status()
        # context manager closes the handle deterministically (was previously leaked)
        with open(defc_file, 'wb') as defc_csv:
            defc_csv.write(r.content)
    except Exception:
        # best-effort refresh: fall back to the existing local copy, but leave a trace for operators
        logger.exception('Could not refresh def_codes.csv from the public reference URL; using local copy')

    logger.info('Loading defc data')
    with create_app().app_context():
        data = pd.read_csv(defc_file, dtype=str)

        # Remove all invalid DEFCs that have been left in the file so USAS can continue to display them correctly
        data = data[data['Is Valid'] == 'true']

        data = clean_data(data, DEFC, {
            'defc': 'code',
            'group_name': 'group'
        }, {})

        diff_found = check_dataframe_diff(data, DEFC, ['defc_id'], ['code'])

        if force_reload or diff_found:
            sess = GlobalDB.db().session
            # delete any data in the DEFC table
            sess.query(DEFC).delete()

            # insert data into table
            num = insert_dataframe(data, DEFC.__table__.name, sess.connection())
            logger.info('{} records inserted to defc'.format(num))
            sess.commit()

            update_external_data_load_date(start_time, datetime.now(), 'defc')
        else:
            logger.info('No differences found, skipping defc table reload.')
def load_submission_window_schedule():
    """ Loads the submission window schedule data, fully replacing the
        submission_window_schedule table.
    """
    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        sub_schedule_file = s3_client.generate_presigned_url(
            'get_object',
            {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "submission_window_schedule.csv"
            },
            ExpiresIn=600)
    else:
        sub_schedule_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config',
                                         'submission_window_schedule.csv')

    logger.info('Loading submission window schedule data')
    with create_app().app_context():
        data = pd.read_csv(sub_schedule_file, dtype=str)
        data = clean_data(
            data,
            SubmissionWindowSchedule,
            {
                'year': 'year',
                'period': 'period',
                'period_start': 'period_start',
                'publish_deadline': 'publish_deadline',
                'certification_deadline': 'certification_deadline'
            }, {})

        # Add a day to the deadlines because the dates in the file are supposed to be inclusive
        data['publish_deadline'] = data.apply(lambda x: add_day(x, 'publish_deadline'), axis=1)
        data['certification_deadline'] = data.apply(lambda x: add_day(x, 'certification_deadline'), axis=1)

        sess = GlobalDB.db().session
        # delete any data in the SubmissionWindowSchedule table
        sess.query(SubmissionWindowSchedule).delete()

        # insert data into table
        num = insert_dataframe(data, SubmissionWindowSchedule.__table__.name, sess.connection())
        logger.info('{} records inserted to submission_window_schedule'.format(num))
        sess.commit()
def load_sql(cls, filename):
    """ Load SQL-based validation rules to db.

        Reads the rule metadata CSV, attaches the rule SQL and derived ids, and replaces the
        RuleSql table only when the content differs from what is already stored.

        Args:
            filename: name of the rules csv file, resolved relative to cls.sql_rules_path

        Raises:
            Exception: if the csv contains a file_type or severity_name not in the known dicts
    """
    with create_app().app_context():
        sess = GlobalDB.db().session
        filename = os.path.join(cls.sql_rules_path, filename)

        # Initial load
        sql_data = pd.read_csv(filename, dtype=str, usecols=cls.headers)
        sql_data = clean_data(
            sql_data,
            RuleSql,
            {'rule_label': 'rule_label', 'rule_error_message': 'rule_error_message', 'query_name': 'query_name',
             'expected_value': 'expected_value', 'category': 'category', 'file_type': 'file_type',
             'target_file': 'target_file', 'rule_cross_file_flag': 'rule_cross_file_flag',
             'severity_name': 'severity_name'},
            {}
        )

        # Processing certain values
        sql_data['rule_sql'] = sql_data['query_name'].apply(lambda name: cls.read_sql_str(name))
        # lambda parameters renamed from `type` to avoid shadowing the builtin
        sql_data['file_id'] = sql_data['file_type'].apply(lambda file_type: FILE_TYPE_DICT.get(file_type, None))
        if sql_data['file_id'].isnull().values.any():
            raise Exception('Invalid file_type value found in sqlLoader. Must be one of the following: {}'
                            .format(', '.join(list(FILE_TYPE_DICT.keys()))))
        sql_data['target_file_id'] = sql_data['target_file'].apply(
            lambda file_type: FILE_TYPE_DICT.get(file_type, None))
        sql_data['rule_cross_file_flag'] = sql_data['rule_cross_file_flag'].apply(
            lambda flag: flag in ('true', 't', 'y', 'yes'))
        sql_data['rule_severity_id'] = sql_data['severity_name'].apply(
            lambda severity_name: RULE_SEVERITY_DICT.get(severity_name, None))
        if sql_data['rule_severity_id'].isnull().values.any():
            raise Exception('Invalid severity_name value found in sqlLoader Must be one of the following: {}'
                            .format(', '.join(list(RULE_SEVERITY_DICT.keys()))))
        sql_data.drop(['file_type', 'severity_name', 'target_file'], axis=1, inplace=True)

        # Final check if we need to actually reload
        if check_dataframe_diff(sql_data, RuleSql, del_cols=['rule_sql_id', 'created_at', 'updated_at'],
                                sort_cols=['rule_label', 'file_id', 'target_file_id']):
            # Delete and reload all records currently in table
            logger.info('Detected changes in {}, deleting RuleSQL and reloading'.format(cls.sql_rules_path))
            sess.query(RuleSql).delete()
            insert_dataframe(sql_data, RuleSql.__table__.name, sess.connection())
            sess.commit()
        else:
            logger.info('No changes detected since last load. Skipping.')
def load_cgac(file_name, force_reload=False):
    """ Load CGAC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
            force_reload: whether to reload regardless

        Returns:
            True if new data was loaded, False if the load was skipped
    """
    sess = GlobalDB.db().session
    # existing CGAC rows keyed by code; updated/deleted in place by the helpers below
    models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

    # read CGAC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data, CGAC,
        {'cgac_agency_code': 'cgac_code', 'agency_name': 'agency_name',
         'agency_abbreviation': 'agency_abbreviation', 'icon_filename': 'icon_name'},
        {'cgac_code': {'pad_to_length': 3}}
    )
    # NOTE(review): args=False relies on pandas treating a falsy `args` as "no extra positional args";
    # args=() would express that intent directly — confirm before changing.
    data['icon_name'] = data['icon_name'].apply(clean_col, args=False)

    # de-dupe
    data.drop_duplicates(subset=['cgac_code'], inplace=True)

    # compare to existing content in table
    diff_found = check_dataframe_diff(data, CGAC, ['cgac_id'], ['cgac_code'],
                                      lambda_funcs=[('agency_abbreviation', extract_abbreviation),
                                                    ('agency_name', extract_name)])

    if force_reload or diff_found:
        delete_missing_cgacs(models, data)
        update_cgacs(models, data)
        sess.add_all(models.values())
        sess.commit()
        logger.info('%s CGAC records inserted', len(models))
        return True
    else:
        logger.info('No differences found, skipping cgac table reload.')
        return False
def clean_tas(csv_path): """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results Args: csv_path: path of the car_tas csv to import Returns: pandas dataframe of clean data imported from the cars_tas csv """ # Encoding accounts for cases where a column may include '\ufeff' data = pd.read_csv(csv_path, dtype=str, encoding='utf-8-sig') for column_mappings in [current_mappings, original_mappings]: try: data = clean_data( data, TASLookup, column_mappings, {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True}, "agency_identifier": {"pad_to_length": 3}, # Account for " " cells "availability_type_code": {"pad_to_length": 0, "keep_null": True}, "beginning_period_of_availa": {"pad_to_length": 0, "keep_null": True}, "ending_period_of_availabil": {"pad_to_length": 0, "keep_null": True}, "main_account_code": {"pad_to_length": 4}, "sub_account_code": {"pad_to_length": 3}, "budget_function_code": {"pad_to_length": 3, "keep_null": True}, "budget_subfunction_code": {"pad_to_length": 3, "keep_null": True}, "budget_bureau_code": {"pad_to_length": 2, "keep_null": True}, "reporting_agency_aid": {"pad_to_length": 3, "keep_null": True} } ) break except ValueError as e: if column_mappings != original_mappings: logger.info('Mismatched columns, trying again with different column set') else: logger.error('Encountered new column set: {}'.format(data.columns)) raise e # Drop all but the last instance of each account number data = data[~data.duplicated(subset=['account_num'], keep='last')] data["account_num"] = pd.to_numeric(data['account_num']) return data.where(pd.notnull(data), None)
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    # fix: log message previously misspelled "districts" as "districtions"
    logger.info("Adding congressional districts from census to the state_congressional table")

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data, model,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
def load_sub_tier_agencies(file_name):
    """ Load Sub Tier Agency (sub_tier-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    # existing sub tier agency rows keyed by code; updated/deleted in place by the helpers below
    models = {sub_tier_agency.sub_tier_agency_code: sub_tier_agency for
              sub_tier_agency in sess.query(SubTierAgency)}

    # read Sub Tier Agency values from csv
    data = pd.read_csv(file_name, dtype=str)

    # rows whose subtier code equals the FPDS department id take priority 1, all others 2
    condition = data["FPDS DEPARTMENT ID"] == data["SUBTIER CODE"]
    data.loc[condition, "PRIORITY"] = 1
    data.loc[~condition, "PRIORITY"] = 2

    # clean data
    data = clean_data(
        data, SubTierAgency,
        {"cgac_agency_code": "cgac_code", "subtier_code": "sub_tier_agency_code", "priority": "priority",
         "frec": "frec_code", "subtier_name": "sub_tier_agency_name", "is_frec": "is_frec"},
        {"cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4},
         "sub_tier_agency_code": {"pad_to_length": 4}}
    )
    # de-dupe
    data.drop_duplicates(subset=['sub_tier_agency_code'], inplace=True)
    # create foreign key dicts (code -> id for codes present in the incoming data)
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                 sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}
    frec_dict = {str(frec.frec_code): frec.frec_id for frec in
                 sess.query(FREC).filter(FREC.frec_code.in_(data["frec_code"])).all()}

    delete_missing_sub_tier_agencies(models, data)
    update_sub_tier_agencies(models, data, cgac_dict, frec_dict)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s Sub Tier Agency records inserted', len(models))
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        # presigned URL (10 min expiry) lets pandas read the csv straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object',
                                                    {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                     'Key': "country_codes.csv"},
                                                    ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session

        # wipe the existing country codes before reloading
        sess.query(CountryCode).delete()

        country_frame = pd.read_csv(filename, dtype=str)
        country_frame = clean_data(
            country_frame,
            CountryCode,
            {"country_code": "country_code",
             "country_name": "country_name"},
            {}
        )
        # keep only the first row seen for each country code
        country_frame.drop_duplicates(subset=['country_code'], inplace=True)

        # write the cleaned rows and commit
        target_table = CountryCode.__table__.name
        inserted = insert_dataframe(country_frame, target_table, sess.connection())
        sess.commit()
        logger.info('{} records inserted to {}'.format(inserted, target_table))
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info("Adding congressional districts from census to the state_congressional table")

    census_frame = pd.read_csv(census_file, dtype=str)
    # normalize headers/values into StateCongressional columns, zero-padding districts to 2 chars
    cleaned = clean_data(
        census_frame,
        StateCongressional,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    # append the cleaned census rows and commit
    insert_dataframe(cleaned, StateCongressional.__table__.name, sess.connection())
    sess.commit()
def clean_tas(csv_path, metrics=None):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results

        Args:
            csv_path: path of the car_tas csv to import
            metrics: an object containing information for the metrics file

        Returns:
            pandas dataframe of clean data imported from the cars_tas csv
    """
    if not metrics:
        metrics = {'records_provided': 0, 'duplicates_dropped': 0}
    # Encoding accounts for cases where a column may include '\ufeff'
    data = pd.read_csv(csv_path, dtype=str, encoding='utf-8-sig')
    # Track this file's own row count. metrics['records_provided'] accumulates across
    # calls, so the original `metrics['records_provided'] - len(data.index)` over-counted
    # duplicates whenever a shared metrics dict was used for several files.
    records_in_file = len(data.index)
    metrics['records_provided'] += records_in_file
    # Try the current column layout first; fall back to the original layout on mismatch
    for column_mappings in [current_mappings, original_mappings]:
        try:
            data = clean_data(
                data,
                TASLookup,
                column_mappings,
                {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True},
                 "agency_identifier": {"pad_to_length": 3},
                 # Account for " " cells
                 "availability_type_code": {"pad_to_length": 0, "keep_null": True},
                 "beginning_period_of_availa": {"pad_to_length": 0, "keep_null": True},
                 "ending_period_of_availabil": {"pad_to_length": 0, "keep_null": True},
                 "main_account_code": {"pad_to_length": 4},
                 "sub_account_code": {"pad_to_length": 3},
                 "budget_function_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_subfunction_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_bureau_code": {"pad_to_length": 2, "keep_null": True},
                 "reporting_agency_aid": {"pad_to_length": 3, "keep_null": True}
                 }
            )
            break
        except ValueError as e:
            if column_mappings != original_mappings:
                logger.info('Mismatched columns, trying again with different column set')
            else:
                # Neither mapping matched -- the file format has changed
                logger.error('Encountered new column set: {}'.format(data.columns))
                raise e
    # Drop all but the last instance of each account number
    data = data[~data.duplicated(subset=['account_num'], keep='last')]
    metrics['duplicates_dropped'] += records_in_file - len(data.index)
    data["account_num"] = pd.to_numeric(data['account_num'])
    # Replace NaN with None so nulls load into the db as real NULLs
    return data.where(pd.notnull(data), None)
def load_cfda_program(base_path):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
    """
    if CONFIG_BROKER["use_aws"]:
        # presigned URL (10 min expiry) lets pandas read the csv straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "cfda_program.csv"
            }, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "cfda_program.csv")
    logger.info('Loading CFDA program file: ' + "cfda_program.csv")
    """Load country code lookup table."""
    # NOTE(review): the string literal above is a no-op statement and is inaccurate
    # (this function loads CFDA programs, not country codes) -- candidate for removal.
    model = CFDAProgram
    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        # delete and replace all existing CFDA program rows
        sess.query(model).delete()

        # 'latin1' handles the non-UTF-8 characters present in the source file
        data = pd.read_csv(filename, dtype=str, encoding='latin1')
        # Map the CFDA csv headers (with their numbered section suffixes, e.g. "_(020)")
        # onto CFDAProgram column names; no padding rules needed.
        data = clean_data(
            data, model, {
                "program_title": "program_title",
                "program_number": "program_number",
                "popular_name_(020)": "popular_name",
                "federal_agency_(030)": "federal_agency",
                "authorization_(040)": "authorization",
                "objectives_(050)": "objectives",
                "types_of_assistance_(060)": "types_of_assistance",
                "uses_and_use_restrictions_(070)": "uses_and_use_restrictions",
                "applicant_eligibility_(081)": "applicant_eligibility",
                "beneficiary_eligibility_(082)": "beneficiary_eligibility",
                "credentials/documentation_(083)": "credentials_documentation",
                "preapplication_coordination_(091)": "preapplication_coordination",
                "application_procedures_(092)": "application_procedures",
                "award_procedure_(093)": "award_procedure",
                "deadlines_(094)": "deadlines",
                "range_of_approval/disapproval_time_(095)": "range_of_approval_disapproval_time",
                "appeals_(096)": "appeals",
                "renewals_(097)": "renewals",
                "formula_and_matching_requirements_(101)": "formula_and_matching_requirements",
                "length_and_time_phasing_of_assistance_(102)": "length_and_time_phasing_of_assistance",
                "reports_(111)": "reports",
                "audits_(112)": "audits",
                "records_(113)": "records",
                "account_identification_(121)": "account_identification",
                "obligations_(122)": "obligations",
                "range_and_average_of_financial_assistance_(123)": "range_and_average_of_financial_assistance",
                "program_accomplishments_(130)": "program_accomplishments",
                "regulations__guidelines__and_literature_(140)": "regulations_guidelines_and_literature",
                "regional_or__local_office_(151)": "regional_or_local_office",
                "headquarters_office_(152)": "headquarters_office",
                "website_address_(153)": "website_address",
                "related_programs_(160)": "related_programs",
                "examples_of_funded_projects_(170)": "examples_of_funded_projects",
                "criteria_for_selecting_proposals_(180)": "criteria_for_selecting_proposals",
                "url": "url",
                "recovery": "recovery",
                "omb_agency_code": "omb_agency_code",
                "omb_bureau_code": "omb_bureau_code",
                "published_date": "published_date",
                "archived_date": "archived_date"
            }, {})
        # normalize the two date columns before loading
        data["published_date"] = format_date(data["published_date"])
        data["archived_date"] = format_date(data["archived_date"])

        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()
        logger.info('{} records inserted to {}'.format(num, table_name))
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: whether to read a local file instead of downloading from S3_CFDA_FILE
            local_file_name: name of the local file to read when load_local is True
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        # unique temp name so concurrent runs don't collide
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is always closed; the original
        # open(filename, 'wb').write(r.content) leaked the handle until GC.
        with open(filename, 'wb') as cfda_file:
            cfda_file.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        # Re-round the Decimal-sourced program number and left-justify with zeroes
        # to 6 characters (e.g. 10.000999... -> "10.001")
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        now = datetime.utcnow()

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates.
        # We'll set it universally to 1
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)
        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the db's audit dates in the created dataframe, and
        # also set all the pks to 1, so they match
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)
        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)
        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]
        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]
        # need to reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)
        # When pandas pulls the data out of postgres, the program_number column is a Decimal
        # that loses precision in the dataframe (10.001 imports as 10.000999999999999), so it
        # must be re-rounded, cast to a string, and padded with trailing zeroes.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))

        # True only when the imported data differs from what was loaded last time
        new_data = not import_dataframe.equals(current_data)

        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        # distinct exit code signals "nothing to do" to the caller
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
def format_fabs_data(data, sess, fips_state_list, state_code_list, sub_tier_list, county_code_list):
    """ Clean, derive, and rename historical award data into PublishedAwardFinancialAssistance columns.

        Args:
            data: dataframe of raw historical award records
            sess: database session, passed through to the derive_* helpers
            fips_state_list: state lookup passed to the derivation helpers -- presumably FIPS-keyed;
                confirm against the derive_* helper signatures
            state_code_list: state code lookup passed to the derivation helpers
            sub_tier_list: sub tier agency lookup used to derive the awarding agency fields
            county_code_list: county code lookup used to derive county names

        Returns:
            a cleaned dataframe ready for load, or None if every row lacked both FAIN and URI
    """
    logger.info("formatting data")
    # drop rows with null FAIN and URI
    data = data[~((data['federal_award_id'].isnull()) & (data['uri'].isnull()))].copy()
    if len(data.index) == 0:
        return None

    # normalize casing on the name/address columns
    proper_casing_cols = ['recipient_name', 'recipient_city_name', 'recipient_county_name', 'receip_addr1',
                          'receip_addr2', 'receip_addr3']
    for col in proper_casing_cols:
        data[col] = data.apply(lambda x: format_proper_casing(x, col), axis=1)

    # these columns carry trailing text after a colon that must be stripped
    cols_with_colons = ['action_type', 'assistance_type', 'agency_code', 'recipient_type', 'correction_late_ind']
    for col in cols_with_colons:
        data[col] = data.apply(lambda x: remove_data_after_colon(x, col), axis=1)

    # data['recipient_city_code'] = data.apply(lambda x: format_integer_code(x, 'recipient_city_code', 5), axis=1)
    # per-column reformatting (codes, zips, amounts, dates)
    data['recipient_county_code'] = data.apply(lambda x: format_integer_code(x, 'recipient_county_code', 3), axis=1)
    data['legal_entity_zip5'] = data.apply(lambda x: format_zip_five(x), axis=1)
    data['legal_entity_zip_last4'] = data.apply(lambda x: format_zip_four(x), axis=1)
    data['total_funding_amount'] = data.apply(lambda x: format_total_funding(x), axis=1)
    data['starting_date'] = data.apply(lambda x: format_date(x, 'starting_date'), axis=1)
    data['ending_date'] = data.apply(lambda x: format_date(x, 'ending_date'), axis=1)
    data['record_type'] = data.apply(lambda x: format_record_type(x), axis=1)
    data['principal_place_zip'] = data.apply(lambda x: format_full_zip(x), axis=1)
    data['principal_place_cd'] = data.apply(lambda x: format_cd(x, 'principal_place_cd'), axis=1)
    data['recipient_cd'] = data.apply(lambda x: format_cd(x, 'recipient_cd'), axis=1)
    # every record produced by this loader is flagged historical
    data['is_historical'] = np.full(len(data.index), True, dtype=bool)

    # row-by-row derivations; each is a per-row apply, so these are the slow steps
    logger.info("Starting derive_legal_entity_city_code")
    data['legal_entity_city_code'] = data.apply(lambda x: derive_legal_entity_city_code(x, sess), axis=1)
    logger.info("Starting derive_awarding_agency_code")
    data['awarding_agency_code'] = data.apply(lambda x: derive_awarding_agency_code(x, sub_tier_list), axis=1)
    logger.info("Starting derive_awarding_agency_name")
    data['awarding_agency_name'] = data.apply(lambda x: derive_awarding_agency_name(x, sub_tier_list), axis=1)
    logger.info("Starting derive_awarding_sub_tier_agency_n")
    data['awarding_sub_tier_agency_n'] = data.apply(lambda x: derive_awarding_sub_tier_agency_n(x, sub_tier_list),
                                                    axis=1)
    logger.info("Starting derive_place_of_perform_county_na")
    data['place_of_perform_county_na'] = data.apply(lambda x: derive_place_of_perform_county_na(x, sess,
                                                                                                fips_state_list,
                                                                                                state_code_list,
                                                                                                county_code_list),
                                                    axis=1)
    logger.info("Starting derive_place_of_performance_city")
    data['place_of_perform_city'] = data.apply(lambda x: derive_place_of_performance_city(x, sess, fips_state_list,
                                                                                          state_code_list), axis=1)
    logger.info("Starting derive_legal_entity_state_name")
    data['legal_entity_state_name'] = data.apply(lambda x: derive_legal_entity_state_name(x, sess, fips_state_list,
                                                                                          state_code_list), axis=1)
    logger.info("Finished derivations")

    # adding columns missing from historical data
    null_list = [
        'awarding_office_code', 'awarding_office_name', 'funding_office_name', 'funding_agency_name',
        'funding_agency_code', 'funding_office_code', 'funding_sub_tier_agency_co', 'funding_sub_tier_agency_na',
        'legal_entity_foreign_city', 'legal_entity_foreign_posta', 'legal_entity_foreign_provi',
        'place_of_performance_forei'
    ]
    for item in null_list:
        data[item] = None

    # rename the historical columns to PublishedAwardFinancialAssistance column names,
    # zero-padding the two congressional district columns to 2 characters
    cdata = clean_data(
        data,
        PublishedAwardFinancialAssistance,
        {
            'obligation_action_date': 'action_date',
            'action_type': 'action_type',
            'assistance_type': 'assistance_type',
            'project_description': 'award_description',
            'recipient_name': 'awardee_or_recipient_legal',
            'duns_no': 'awardee_or_recipient_uniqu',
            'awarding_agency_code': 'awarding_agency_code',
            'awarding_agency_name': 'awarding_agency_name',
            'awarding_office_code': 'awarding_office_code',
            'awarding_office_name': 'awarding_office_name',
            'agency_code': 'awarding_sub_tier_agency_c',
            'awarding_sub_tier_agency_n': 'awarding_sub_tier_agency_n',
            'federal_award_mod': 'award_modification_amendme',
            'rec_flag': 'business_funds_indicator',
            'recipient_type': 'business_types',
            'cfda_program_num': 'cfda_number',
            'cfda_program_title': 'cfda_title',
            'correction_late_ind': 'correction_delete_indicatr',
            'face_loan_guran': 'face_value_loan_guarantee',
            'federal_award_id': 'fain',
            'fed_funding_amount': 'federal_action_obligation',
            'fyq_correction': 'fiscal_year_and_quarter_co',
            'funding_agency_name': 'funding_agency_name',
            'funding_agency_code': 'funding_agency_code',
            'funding_office_code': 'funding_office_code',
            'funding_office_name': 'funding_office_name',
            'funding_sub_tier_agency_co': 'funding_sub_tier_agency_co',
            'funding_sub_tier_agency_na': 'funding_sub_tier_agency_na',
            'is_historical': 'is_historical',
            'receip_addr1': 'legal_entity_address_line1',
            'receip_addr2': 'legal_entity_address_line2',
            'receip_addr3': 'legal_entity_address_line3',
            'legal_entity_city_code': 'legal_entity_city_code',
            'recipient_city_name': 'legal_entity_city_name',
            'recipient_cd': 'legal_entity_congressional',
            'recipient_country_code': 'legal_entity_country_code',
            'recipient_county_code': 'legal_entity_county_code',
            'recipient_county_name': 'legal_entity_county_name',
            'legal_entity_foreign_city': 'legal_entity_foreign_city',
            'legal_entity_foreign_posta': 'legal_entity_foreign_posta',
            'legal_entity_foreign_provi': 'legal_entity_foreign_provi',
            'recipient_state_code': 'legal_entity_state_code',
            'legal_entity_zip5': 'legal_entity_zip5',
            'legal_entity_state_name': 'legal_entity_state_name',
            'legal_entity_zip_last4': 'legal_entity_zip_last4',
            'last_modified_date': 'modified_at',
            'non_fed_funding_amount': 'non_federal_funding_amount',
            'orig_sub_guran': 'original_loan_subsidy_cost',
            'ending_date': 'period_of_performance_curr',
            'starting_date': 'period_of_performance_star',
            'principal_place_code': 'place_of_performance_code',
            'principal_place_cd': 'place_of_performance_congr',
            'principal_place_country_code': 'place_of_perform_country_c',
            'place_of_performance_forei': 'place_of_performance_forei',
            'principal_place_zip': 'place_of_performance_zip4a',
            'place_of_perform_city': 'place_of_performance_city',
            'place_of_perform_county_na': 'place_of_perform_county_na',
            'principal_place_state': 'place_of_perform_state_nam',
            'record_type': 'record_type',
            'sai_number': 'sai_number',
            'total_funding_amount': 'total_funding_amount',
            'uri': 'uri'
        },
        {
            'place_of_performance_congr': {'pad_to_length': 2, 'keep_null': True},
            'legal_entity_congressional': {'pad_to_length': 2, 'keep_null': True}
        }
    )

    # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented as
    # NULL in the db.
    cdata = cdata.replace(np.nan, '', regex=True)
    cdata = cdata.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

    # generate the afa_generated_unique field
    cdata['afa_generated_unique'] = cdata.apply(lambda x: generate_unique_string(x), axis=1)
    return cdata
def clean_tas(csv_path):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results

        Args:
            csv_path: path of the car_tas csv to import

        Returns:
            pandas dataframe of clean data imported from the cars_tas csv
    """
    # Encoding accounts for cases where a column may include '\ufeff'
    data = pd.read_csv(csv_path, dtype=str, encoding='utf-8-sig')
    # Try the current column layout first; fall back to the original layout on mismatch
    for column_mappings in [current_mappings, original_mappings]:
        try:
            data = clean_data(
                data,
                TASLookup,
                column_mappings,
                {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True},
                 "agency_identifier": {"pad_to_length": 3},
                 # Account for " " cells
                 "availability_type_code": {"pad_to_length": 0, "keep_null": True},
                 "beginning_period_of_availa": {"pad_to_length": 0, "keep_null": True},
                 "ending_period_of_availabil": {"pad_to_length": 0, "keep_null": True},
                 "main_account_code": {"pad_to_length": 4},
                 "sub_account_code": {"pad_to_length": 3},
                 "budget_function_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_subfunction_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_bureau_code": {"pad_to_length": 2, "keep_null": True},
                 "reporting_agency_aid": {"pad_to_length": 3, "keep_null": True}
                 }
            )
            break
        except ValueError as e:
            if column_mappings != original_mappings:
                logger.info('Mismatched columns, trying again with different column set')
            else:
                # Neither mapping matched -- the file format has changed
                logger.error('Encountered new column set: {}'.format(data.columns))
                raise e
    # Drop all but the last instance of each account number; the sibling clean_tas
    # variants do this before loading, and without it repeated account_num rows from
    # cars_tas would flow through to the db load.
    data = data[~data.duplicated(subset=['account_num'], keep='last')]
    data["account_num"] = pd.to_numeric(data['account_num'])
    # Replace NaN with None so nulls load into the db as real NULLs
    return data.where(pd.notnull(data), None)
def parse_exec_comp_file(file_path, metrics=None):
    """ Parses the executive compensation file to update corresponding SAM records

        Args:
            file_path: the path to the SAM file
            metrics: dictionary representing metrics of the script

        Raises:
            Exception: couldn't extract the last exec comp modification date, this generally means the filename
                provided doesn't match the expected format.

        Returns:
            dataframe representing the contents in the file
    """
    if not metrics:
        metrics = {
            'files_processed': [],
            'records_received': 0,
            'records_processed': 0
        }
    logger.info('Starting file ' + file_path)

    # The zip is expected to contain one .dat named after the archive; the underscore-split
    # filename carries the file date (last token) and the period (4th token).
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    dat_file_name = file_name + '.dat'
    file_name_props = file_name.split('_')
    dat_file_date = file_name_props[-1]
    version = 'v2' if 'V2' in file_name else 'v1'
    period = file_name_props[3]
    zfile = zipfile.ZipFile(file_path)

    # positional (0-based column index) mappings within the pipe-delimited .dat
    v1_column_header_mapping = {
        'awardee_or_recipient_uniqu': 0,
        'sam_extract_code': 4,
        'exec_comp_str': 89
    }
    v2_column_header_mapping = {
        'uei': 0,
        'awardee_or_recipient_uniqu': 1,
        'sam_extract_code': 5,
        'exec_comp_str': 91
    }
    column_header_mapping = v1_column_header_mapping if version == 'v1' else v2_column_header_mapping
    # sort by column index so names= and usecols= line up in read_csv
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    key_col = 'awardee_or_recipient_uniqu' if version == 'v1' else 'uei'

    # can't use skipfooter, pandas' c engine doesn't work with skipfooter and the python engine doesn't work with dtype
    nrows = 0
    with zfile.open(dat_file_name) as dat_file:
        nrows = len(dat_file.readlines()) - 2  # subtract the header and footer
    with zfile.open(dat_file_name) as dat_file:
        csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                               usecols=column_header_mapping_ordered.values(),
                               names=column_header_mapping_ordered.keys(), quoting=3)
    total_data = csv_data.copy()
    records_received = len(total_data.index)

    # trimming all columns before cleaning to ensure the sam_extract is working as intended
    total_data = total_data.applymap(lambda x: trim_item(x) if len(str(x).strip()) else None)

    # keep only rows that have a key and one of the extract codes of interest
    total_data = total_data[total_data[key_col].notnull()
                            & total_data['sam_extract_code'].isin(['2', '3', 'A', 'E'])]
    records_processed = len(total_data.index)
    del total_data['sam_extract_code']

    # drop SAM duplicates, taking only the last one (the first one for MONTHLY files)
    keep = 'first' if period == 'MONTHLY' else 'last'
    total_data.drop_duplicates(subset=[key_col], keep=keep, inplace=True)

    # Note: we're splitting these up cause it vastly saves memory parsing only the records that are populated
    blank_exec = total_data[total_data['exec_comp_str'].isnull()]
    pop_exec = total_data[total_data['exec_comp_str'].notnull()]

    # parse out executive compensation from row 90 for populated records
    if not pop_exec.empty:
        lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
        parsed_data = pop_exec['exec_comp_str'].apply(lambda_func)
        # parse_exec_comp() with no args supplies the canonical column names
        parsed_data.columns = list(parse_exec_comp().keys())
        pop_exec = pd.concat([pop_exec, parsed_data], axis=1)
    else:
        # no populated rows: just attach the (empty) exec comp columns
        pop_exec = pop_exec.assign(**parse_exec_comp())

    # leave blanks
    blank_exec = blank_exec.assign(**parse_exec_comp())

    # setup the final dataframe
    total_data = pd.concat([pop_exec, blank_exec])
    del total_data['exec_comp_str']
    total_data.replace('', np.nan, inplace=True)
    # the trailing filename token is the extract date, e.g. 20200131
    last_exec_comp_mod_date = datetime.datetime.strptime(dat_file_date, '%Y%m%d').date()
    total_data = total_data.assign(last_exec_comp_mod_date=last_exec_comp_mod_date)

    # Cleaning out any untrimmed strings
    if not total_data.empty:
        exec_comp_maps = {
            'awardee_or_recipient_uniqu': 'awardee_or_recipient_uniqu',
            'high_comp_officer1_amount': 'high_comp_officer1_amount',
            'high_comp_officer1_full_na': 'high_comp_officer1_full_na',
            'high_comp_officer2_amount': 'high_comp_officer2_amount',
            'high_comp_officer2_full_na': 'high_comp_officer2_full_na',
            'high_comp_officer3_amount': 'high_comp_officer3_amount',
            'high_comp_officer3_full_na': 'high_comp_officer3_full_na',
            'high_comp_officer4_amount': 'high_comp_officer4_amount',
            'high_comp_officer4_full_na': 'high_comp_officer4_full_na',
            'high_comp_officer5_amount': 'high_comp_officer5_amount',
            'high_comp_officer5_full_na': 'high_comp_officer5_full_na',
            'last_exec_comp_mod_date': 'last_exec_comp_mod_date'
        }
        if version == 'v2':
            exec_comp_maps['uei'] = 'uei'
        total_data = clean_data(total_data, SAMRecipient, exec_comp_maps, {})
        # clean_data adds audit columns the caller doesn't want; drop them
        total_data.drop(columns=['created_at', 'updated_at'], inplace=True)

    metrics['files_processed'].append(dat_file_name)
    metrics['records_received'] += records_received
    metrics['records_processed'] += records_processed

    return total_data
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        # unique temp name so concurrent runs don't collide
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is always closed; the original
        # open(filename, 'wb').write(r.content) leaked the handle until GC.
        with open(filename, 'wb') as cfda_file:
            cfda_file.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        # Re-round the Decimal-sourced program number and left-justify with zeroes
        # to 6 characters (e.g. 10.000999... -> "10.001")
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        # exits before writing metrics; exit code 3 signals "nothing to do"
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        # Fixed typo in log message ("progrma" -> "program")
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            # NOTE(review): pd.io.common.EmptyDataError was removed in newer pandas;
            # pd.errors.EmptyDataError is the long-term spelling -- confirm pinned version.
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return
            headers = set([header.upper() for header in list(raw_data)])

            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id',
                     'allocation_id': 'allocation_transfer_id', 'account_code': 'account_number',
                     'pa_code': 'program_activity_code', 'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # because we're only loading a subset of program activity info, there will be duplicate records in the
            # dataframe. this is ok, but need to de-duped before the db load. We also need to log them.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

        if export:
            export_public_pa(raw_data)

        end_time = datetime.datetime.now()
        update_external_data_load_date(now, end_time, 'program_activity')
        update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['records_inserted'] = num

        metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        # some rows were invalid but under the failure threshold
        exit_if_nonlocal(3)
        return