def reload_from_sam(sess):
    """ Reload current historic recipient data from SAM to pull in any new columns or data

        Args:
            sess: database connection
    """
    # Every DUNS currently in the historic table is re-pulled from SAM
    historic_recps_to_update = sess.query(HistoricDUNS.awardee_or_recipient_uniqu).all()
    for sam_batch in batch(historic_recps_to_update, LOAD_BATCH_SIZE):
        # DataFrame.append was removed in pandas 2.0; build the frame directly from the
        # batch of single-column query rows instead of appending to an empty frame
        df = pd.DataFrame(sam_batch, columns=['awardee_or_recipient_uniqu'])
        # Fill in the remaining columns from the SAM API, then upsert the batch
        df = update_sam_props(df)
        update_sam_recipient(sess, df, table_name=HistoricDUNS.__table__.name)
def backfill_uei_via_entity_api(sess, table):
    """ Backfill any extraneous data (ex. uei) missing from V1 data that wasn't updated by V2

        Args:
            sess: database connection
            table: table to backfill
    """
    # Rows still missing a UEI, or missing a parent UEI while a parent DUNS exists.
    # NOTE(review): the previous version filtered on SAMRecipient columns regardless of
    # `table`, which produced an implicit cross join whenever table != SAMRecipient;
    # filter on the target table's own columns instead.
    duns_to_update = sess.query(table.awardee_or_recipient_uniqu).filter(
        or_(table.uei.is_(None),
            and_(table.ultimate_parent_unique_ide.isnot(None),
                 table.ultimate_parent_uei.is_(None)))).all()
    for duns_batch in batch(duns_to_update, LOAD_BATCH_SIZE):
        # DataFrame.append was removed in pandas 2.0; build the frame directly from the
        # batch of single-column query rows instead of appending to an empty frame
        df = pd.DataFrame(duns_batch, columns=['awardee_or_recipient_uniqu'])
        df = update_sam_props(df)
        # Only the UEI columns are backfilled; drop everything else before the upsert
        df = df[['awardee_or_recipient_uniqu', 'uei', 'ultimate_parent_uei']]
        update_sam_recipient(sess, df, table_name=table.__table__.name)
def backfill_uei_crosswalk(sess, table_name):
    """ Backfill any extraneous data (ex. uei) missing from V1 data that wasn't updated by V2

        Args:
            sess: database connection
            table_name: table to backfill
    """
    # Pull every DUNS in the crosswalk table that is still missing a UEI
    blank_uei_query = """
        SELECT awardee_or_recipient_uniqu
        FROM {table_name}
        WHERE uei IS NULL;
    """.format(table_name=table_name)
    duns_missing_uei = [record['awardee_or_recipient_uniqu']
                        for record in sess.execute(blank_uei_query).fetchall()]
    for chunk in batch(duns_missing_uei, LOAD_BATCH_SIZE):
        chunk_df = pd.DataFrame({'awardee_or_recipient_uniqu': chunk})
        # Look up the UEIs through the IQaaS API, keep only the crosswalk columns, upsert
        chunk_df = update_sam_props(chunk_df, api='iqaas')
        chunk_df = chunk_df[['awardee_or_recipient_uniqu', 'uei']]
        update_sam_recipient(sess, chunk_df, table_name=table_name)
def process_sam_file(data_type, period, version, date, sess, local=None, api=False, metrics=None):
    """ Process the SAM file found locally or remotely

        Args:
            data_type: data type to load (DUNS or executive compensation)
            period: monthly or daily
            version: v1 or v2
            date: date of the file to process, used to build the file name
            sess: the database connection
            local: path to local directory to process, if None, it will go though the remote SAM service
            api: whether to use the SAM CSV API or not
            metrics: dictionary representing metrics data for the load

        Raises:
            requests.exceptions.HTTPError if the SAM HTTP API doesnt have the file requested
    """
    if not metrics:
        metrics = {}

    root_dir = local if local else tempfile.gettempdir()
    file_name_format = SAM_FILE_FORMAT.format(data_type=DATA_TYPES[data_type], period=period,
                                              version=VERSIONS[version])
    file_name = date.strftime(file_name_format)
    if not local:
        # Remote processing: pull the file into the temp directory first
        download_sam_file(root_dir, file_name, api=api)
    file_path = os.path.join(root_dir, file_name)
    # Only V2 files carry UEI columns
    includes_uei = version == 'v2'
    if data_type == 'DUNS':
        add_update_data, delete_data = parse_sam_recipient_file(file_path, metrics=metrics)
        if add_update_data is not None:
            update_sam_recipient(sess, add_update_data, metrics=metrics, includes_uei=includes_uei)
        if delete_data is not None:
            update_sam_recipient(sess, delete_data, metrics=metrics, deletes=True, includes_uei=includes_uei)
    else:
        exec_comp_data = parse_exec_comp_file(file_path, metrics=metrics)
        update_sam_recipient(sess, exec_comp_data, metrics=metrics, includes_uei=includes_uei)
    if not local:
        # Clean up the downloaded copy; locally-provided files are left in place
        os.remove(file_path)
def run_sam_batches(file, sess, block_size=LOAD_BATCH_SIZE):
    """ Updates Historic DUNS table in chunks from csv file

        Args:
            file: path to the recipient export file to use
            sess: the database connection
            block_size: the size of the batches to read from the recipient export file.
    """
    logger.info("Retrieving total rows from recipients file")
    count_start = datetime.now()

    # Internal names for the export file's columns, mapped to the source headers
    column_headers = [
        "awardee_or_recipient_uniqu",  # DUNS Field
        "registration_date",  # Registration_Date
        "expiration_date",  # Expiration_Date
        "last_sam_mod_date",  # Last_Update_Date
        "activation_date",  # Activation_Date
        "legal_business_name"  # Legal_Business_Name
    ]
    reader = pd.read_csv(file, skipinitialspace=True, header=None, quotechar='"', dtype=str,
                         names=column_headers, iterator=True, chunksize=block_size, skiprows=1)
    # Materialize all chunks up front so the total row count can be logged before loading
    chunks = list(reader)
    total_rows = sum(len(chunk.index) for chunk in chunks)
    logger.info("Retrieved row count of {} in {} s".format(
        total_rows, (datetime.now() - count_start).total_seconds()))

    recipients_added = 0
    for chunk in chunks:
        # Drop rows that have no DUNS at all
        chunk = chunk[chunk['awardee_or_recipient_uniqu'].notnull()]
        # Skip recipients that are already present in the database
        new_recipients = remove_existing_recipients(chunk, sess)
        if new_recipients.empty:
            continue
        logger.info("Adding {} SAM records from historic data".format(len(new_recipients.index)))
        load_start = datetime.now()
        # Enrich the incoming recipients with their address info before cleaning/loading
        new_recipients = update_sam_props(new_recipients)
        identity_mappings = {col: col for col in new_recipients.columns}
        new_recipients = clean_data(new_recipients, HistoricDUNS, identity_mappings, {})
        recipients_added += len(new_recipients.index)
        update_sam_recipient(sess, new_recipients, HistoricDUNS.__table__.name)
        sess.commit()
        logger.info("Finished updating {} SAM rows in {} s".format(
            len(new_recipients.index), (datetime.now() - load_start).total_seconds()))
    logger.info("Imported {} historical recipients".format(recipients_added))