# Example #1 (score: 0)
def reload_from_sam(sess):
    """ Reload current historic recipient data from SAM to pull in any new columns or data

        Args:
            sess: database connection
    """
    historic_recps_to_update = sess.query(
        HistoricDUNS.awardee_or_recipient_uniqu).all()
    for sam_batch in batch(historic_recps_to_update, LOAD_BATCH_SIZE):
        # Build the frame directly from the batch of single-column rows;
        # the old empty-frame + DataFrame.append pattern was removed in pandas 2.0.
        df = pd.DataFrame(sam_batch, columns=['awardee_or_recipient_uniqu'])
        # Enrich each batch with current SAM properties, then write it back
        df = update_sam_props(df)
        update_sam_recipient(sess, df, table_name=HistoricDUNS.__table__.name)
def backfill_uei_via_entity_api(sess, table):
    """ Backfill any extraneous data (ex. uei) missing from V1 data that wasn't updated by V2

        Args:
            sess: database connection
            table: table to backfill
    """
    # NOTE(review): the SELECT targets `table` but the filter references
    # SAMRecipient columns unconditionally; if `table` is ever not SAMRecipient
    # this produces a cross join — confirm `table` is always SAMRecipient here.
    duns_to_update = sess.query(table.awardee_or_recipient_uniqu).filter(
        or_(
            SAMRecipient.uei.is_(None),
            and_(SAMRecipient.ultimate_parent_unique_ide.isnot(None),
                 SAMRecipient.ultimate_parent_uei.is_(None)))).all()
    for duns_batch in batch(duns_to_update, LOAD_BATCH_SIZE):
        # Build the frame directly from the batch of single-column rows;
        # the old empty-frame + DataFrame.append pattern was removed in pandas 2.0.
        df = pd.DataFrame(duns_batch, columns=['awardee_or_recipient_uniqu'])
        df = update_sam_props(df)
        # Only write back the UEI columns this backfill is responsible for
        df = df[['awardee_or_recipient_uniqu', 'uei', 'ultimate_parent_uei']]
        update_sam_recipient(sess, df, table_name=table.__table__.name)
def backfill_uei_crosswalk(sess, table_name):
    """ Backfill any extraneous data (ex. uei) missing from V1 data that wasn't updated by V2

        Args:
            sess: database connection
            table_name: table to backfill
    """
    # Find every DUNS in the target table that still has no UEI
    missing_uei_sql = """
        SELECT awardee_or_recipient_uniqu
        FROM {table_name}
        WHERE uei IS NULL;
    """.format(table_name=table_name)
    duns_to_update = []
    for result_row in sess.execute(missing_uei_sql).fetchall():
        duns_to_update.append(result_row['awardee_or_recipient_uniqu'])
    for duns_batch in batch(duns_to_update, LOAD_BATCH_SIZE):
        # Look up each batch's UEI via the crosswalk API, then persist it
        batch_frame = pd.DataFrame()
        batch_frame['awardee_or_recipient_uniqu'] = duns_batch
        batch_frame = update_sam_props(batch_frame, api='iqaas')
        update_sam_recipient(sess, batch_frame[['awardee_or_recipient_uniqu', 'uei']],
                             table_name=table_name)
# Example #4 (score: 0)
def process_sam_file(data_type, period, version, date, sess, local=None, api=False, metrics=None):
    """ Process the SAM file found locally or remotely

        Args:
            data_type: data type to load (DUNS or executive compensation)
            period: monthly or daily
            version: v1 or v2
            date: date used to fill in the dated portions of the SAM file name
            sess: the database connection
            local: path to local directory to process; if None, it will go through the remote SAM service
            api: whether to use the SAM CSV API or not
            metrics: dictionary representing metrics data for the load

        Raises:
            requests.exceptions.HTTPError if the SAM HTTP API doesn't have the file requested
    """
    if not metrics:
        metrics = {}

    # Work out of the caller's directory for local loads, a temp dir for remote ones
    root_dir = local if local else tempfile.gettempdir()
    file_name_format = SAM_FILE_FORMAT.format(data_type=DATA_TYPES[data_type], period=period, version=VERSIONS[version])
    file_name = date.strftime(file_name_format)
    if not local:
        download_sam_file(root_dir, file_name, api=api)

    file_path = os.path.join(root_dir, file_name)
    # Only V2 files carry UEI columns
    includes_uei = version == 'v2'
    if data_type == 'DUNS':
        # DUNS files split into adds/updates and deletes, handled separately
        add_update_data, delete_data = parse_sam_recipient_file(file_path, metrics=metrics)
        if add_update_data is not None:
            update_sam_recipient(sess, add_update_data, metrics=metrics, includes_uei=includes_uei)
        if delete_data is not None:
            update_sam_recipient(sess, delete_data, metrics=metrics, deletes=True, includes_uei=includes_uei)
    else:
        exec_comp_data = parse_exec_comp_file(file_path, metrics=metrics)
        update_sam_recipient(sess, exec_comp_data, metrics=metrics, includes_uei=includes_uei)
    # Clean up files we downloaded ourselves; leave caller-provided local files alone
    if not local:
        os.remove(file_path)
# Example #5 (score: 0)
def run_sam_batches(file, sess, block_size=LOAD_BATCH_SIZE):
    """ Updates Historic DUNS table in chunks from csv file

        Args:
            file: path to the recipient export file to use
            sess: the database connection
            block_size: the size of the batches to read from the recipient export file.
    """
    logger.info("Retrieving total rows from recipients file")
    start = datetime.now()

    # CSV column header names in the recipient file (the file's own header row is skipped)
    column_headers = [
        "awardee_or_recipient_uniqu",  # DUNS Field
        "registration_date",  # Registration_Date
        "expiration_date",  # Expiration_Date
        "last_sam_mod_date",  # Last_Update_Date
        "activation_date",  # Activation_Date
        "legal_business_name"  # Legal_Business_Name
    ]
    sam_reader_obj = pd.read_csv(file,
                                 skipinitialspace=True,
                                 header=None,
                                 quotechar='"',
                                 dtype=str,
                                 names=column_headers,
                                 iterator=True,
                                 chunksize=block_size,
                                 skiprows=1)
    # Materialize all chunks up front so the total row count can be logged
    # before processing begins
    sam_dfs = list(sam_reader_obj)
    row_count = sum(len(sam_df.index) for sam_df in sam_dfs)
    logger.info("Retrieved row count of {} in {} s".format(
        row_count, (datetime.now() - start).total_seconds()))

    recipients_added = 0
    for sam_df in sam_dfs:
        # Remove rows where awardee_or_recipient_uniqu is null
        sam_df = sam_df[sam_df['awardee_or_recipient_uniqu'].notnull()]
        # Ignore old recipients we already have
        recps_to_load = remove_existing_recipients(sam_df, sess)

        if not recps_to_load.empty:
            logger.info("Adding {} SAM records from historic data".format(
                len(recps_to_load.index)))
            start = datetime.now()

            # get address info for incoming recipients
            recps_to_load = update_sam_props(recps_to_load)
            column_mappings = {col: col for col in recps_to_load.columns}
            recps_to_load = clean_data(recps_to_load, HistoricDUNS,
                                       column_mappings, {})
            recipients_added += len(recps_to_load.index)
            # Pass table_name by keyword for consistency with the other
            # update_sam_recipient call sites
            update_sam_recipient(sess, recps_to_load,
                                 table_name=HistoricDUNS.__table__.name)
            sess.commit()

            logger.info("Finished updating {} SAM rows in {} s".format(
                len(recps_to_load.index),
                (datetime.now() - start).total_seconds()))

    logger.info("Imported {} historical recipients".format(recipients_added))