def load_object_class(base_path):
    """ This function loads Object classes into the database by fully replacing the object_class table.

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "object_class.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: drop all existing rows before re-inserting
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info("Adding congressional districts from census to the state_congressional table")

    census_df = pd.read_csv(census_file, dtype=str)
    census_df = clean_data(
        census_df,
        StateCongressional,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {"congressional_district_no": {"pad_to_length": 2}}
    )

    # append the cleaned census rows and persist them
    insert_dataframe(census_df, StateCongressional.__table__.name, sess.connection())
    sess.commit()
def load_country_codes(base_path):
    """ Load Country Codes into the database, fully replacing the country_code table
        and writing a metrics JSON file alongside.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    # metrics are accumulated during the load and dumped to a JSON file at the end
    metrics_json = {
        'script_name': 'load_country_codes.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_provided': 0,
        'duplicates_dropped': 0,
        'records_inserted': 0
    }

    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object',
            {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "country_codes.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: delete and replace all existing country codes
        metrics_json['records_deleted'] = sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        metrics_json['records_provided'] = len(data.index)
        data = clean_data(data, CountryCode, {
            "country_code": "country_code",
            "country_name": "country_name"
        }, {})
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_provided'] - len(data.index)

        # flag territories or freely associated states
        data["territory_free_state"] = np.where(data["country_code"].isin(TERRITORIES_FREE_STATES), True, False)

        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        metrics_json['records_inserted'] = num
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_country_codes_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info("Script complete")
def clean_sam_data(data):
    """ Wrapper around clean_data with the DUNS context

        Args:
            data: the dataframe to be cleaned

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # nothing to clean on an empty frame
    if data.empty:
        return data

    # every source column keeps its own name, so build the identity mapping from one column list
    columns = (
        "awardee_or_recipient_uniqu", "activation_date", "deactivation_date", "registration_date",
        "expiration_date", "last_sam_mod_date", "legal_business_name", "dba_name", "address_line_1",
        "address_line_2", "city", "state", "zip", "zip4", "country_code", "congressional_district",
        "entity_structure", "business_types_codes", "business_types", "ultimate_parent_legal_enti",
        "ultimate_parent_unique_ide"
    )
    return clean_data(data, DUNS, {name: name for name in columns}, {})
def load_frec(file_name):
    """ Load FREC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    # existing FREC rows keyed by code; updated/deleted in place by the helpers below
    models = {frec.frec_code: frec for frec in sess.query(FREC)}

    # read FREC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data, FREC,
        {"frec": "frec_code", "cgac_agency_code": "cgac_code", "frec_entity_description": "agency_name",
         "agency_abbreviation": "agency_abbreviation"},
        {"frec": {"keep_null": False}, "cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4}}
    )
    # de-dupe
    data.drop_duplicates(subset=['frec_code'], inplace=True)
    # create foreign key dicts (cgac_code -> cgac_id for codes present in the incoming data)
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                 sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}
    # insert to db
    delete_missing_frecs(models, data)
    update_frecs(models, data, cgac_dict)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s FREC records inserted', len(models))
def load_quarterly_threshold():
    """ Loads the quarterly revalidation threshold data, fully replacing the
        quarterly_revalidation_threshold table.
    """
    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        threshold_file = s3_client.generate_presigned_url('get_object',
                                                          {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                           'Key': "quarterly_submission_dates.csv"},
                                                          ExpiresIn=600)
    else:
        threshold_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                      "quarterly_submission_dates.csv")

    logger.info('Loading quarterly revalidation threshold data')
    with create_app().app_context():
        data = pd.read_csv(threshold_file, dtype=str)

        data = clean_data(
            data,
            QuarterlyRevalidationThreshold,
            {"year": "year", "quarter": "quarter", "window_start": "window_start", "window_end": "window_end"},
            {}
        )

        sess = GlobalDB.db().session
        # delete any data in the QuarterlyRevalidationThreshold table
        sess.query(QuarterlyRevalidationThreshold).delete()

        # insert data into table
        num = insert_dataframe(data, QuarterlyRevalidationThreshold.__table__.name, sess.connection())
        logger.info('{} records inserted to quarterly_revalidation_threshold'.format(num))
        sess.commit()
def load_cgac(file_name):
    """ Load CGAC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    # existing CGAC rows keyed by code; updated/deleted in place by the helpers below
    models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

    # read CGAC values from csv
    data = pd.read_csv(file_name, dtype=str)
    # clean data
    data = clean_data(
        data, CGAC,
        {"cgac_agency_code": "cgac_code", "agency_name": "agency_name",
         "agency_abbreviation": "agency_abbreviation"},
        {"cgac_code": {"pad_to_length": 3}}
    )
    # de-dupe
    data.drop_duplicates(subset=['cgac_code'], inplace=True)

    delete_missing_cgacs(models, data)
    update_cgacs(models, data)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s CGAC records inserted', len(models))
def clean_sam_data(data, table=DUNS):
    """ Wrapper around clean_data with the DUNS context

        Args:
            data: the dataframe to be cleaned
            table: the table to work from (could be DUNS/HistoricParentDuns)

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # every source column keeps its own name, so build the identity mapping from a single column list
    sam_columns = (
        "awardee_or_recipient_uniqu", "activation_date", "deactivation_date", "registration_date",
        "expiration_date", "last_sam_mod_date", "sam_extract_code", "legal_business_name", "dba_name",
        "address_line_1", "address_line_2", "city", "state", "zip", "zip4", "country_code",
        "congressional_district", "entity_structure", "business_types_codes",
        "ultimate_parent_legal_enti", "ultimate_parent_unique_ide"
    )
    return clean_data(data, table, {column: column for column in sam_columns}, {})
def clean_office(csv_path):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results.

        Args:
            csv_path: path/url to the file to read from
    """
    # all office fields map to themselves, so the mapping is an identity over this list
    office_columns = ["department_id", "department_name", "agency_code", "agency_name",
                      "contracting_office_code", "contracting_office_name", "start_date", "end_date",
                      "address_city", "address_state", "zip_code", "country_code"]
    office_data = clean_data(
        pd.read_csv(csv_path, dtype=str),
        FPDSContractingOffice,
        {column: column for column in office_columns},
        {"department_id": {"pad_to_length": 4}}
    )
    # swap pandas NaN for None so downstream consumers see real NULLs
    return office_data.where(pd.notnull(office_data), None)
def clean_office(csv_path):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results.

        Args:
            csv_path: path/url to the file to read from
    """
    raw = pd.read_csv(csv_path, dtype=str)
    # source and target column names are identical, so zip the list against itself
    field_names = ("department_id", "department_name", "agency_code", "agency_name",
                   "contracting_office_code", "contracting_office_name", "start_date", "end_date",
                   "address_city", "address_state", "zip_code", "country_code")
    cleaned = clean_data(raw, FPDSContractingOffice, dict(zip(field_names, field_names)),
                         {"department_id": {"pad_to_length": 4}})
    # replace pandas NaN with None so the database layer receives NULLs
    return cleaned.where(pd.notnull(cleaned), None)
def clean_sam_data(data, table=DUNS):
    """ Wrapper around clean_data with the DUNS context

        Args:
            data: the dataframe to be cleaned
            table: the table to work from (could be DUNS/HistoricParentDuns)

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # all fields carry over under their original names; spell the mapping out from one flat list
    fields = [
        "awardee_or_recipient_uniqu", "activation_date", "deactivation_date", "registration_date",
        "expiration_date", "last_sam_mod_date", "sam_extract_code", "legal_business_name", "dba_name",
        "address_line_1", "address_line_2", "city", "state", "zip", "zip4", "country_code",
        "congressional_district", "business_types_codes", "ultimate_parent_legal_enti",
        "ultimate_parent_unique_ide",
    ]
    identity_mapping = dict(zip(fields, fields))
    return clean_data(data, table, identity_mapping, {})
def clean_sf133_data(filename, sf133_data):
    """ Clean up the data read in from the file.

        Args:
            filename: name/path of the file to be read
            sf133_data: Data model to clean against

        Returns:
            cleaned dataframe with derived `tas`, `display_tas`, and numeric `amount` columns,
            after dropping known duplicate report lines and filling in missing 0-value lines
    """
    data = pd.read_csv(filename, dtype=str)
    data = clean_data(
        data,
        sf133_data,
        {"ata": "allocation_transfer_agency",
         "aid": "agency_identifier",
         "availability_type_code": "availability_type_code",
         "bpoa": "beginning_period_of_availa",
         "epoa": "ending_period_of_availabil",
         "main_account": "main_account_code",
         "sub_account": "sub_account_code",
         "fiscal_year": "fiscal_year",
         "period": "period",
         "line_num": "line",
         "amount_summed": "amount",
         "defc": "disaster_emergency_fund_code"},
        {"allocation_transfer_agency": {"pad_to_length": 3},
         "agency_identifier": {"pad_to_length": 3},
         "main_account_code": {"pad_to_length": 4},
         "sub_account_code": {"pad_to_length": 3},
         # next 3 lines handle the TAS fields that shouldn't be padded but should still be empty spaces rather than
         # NULLs. this ensures that the downstream pivot & melt (which insert the missing 0-value SF-133 lines) will
         # work as expected (values used in the pivot index cannot be NULL). the "pad_to_length: 0" works around the
         # fact that sometimes the incoming data for these columns is a single space and sometimes it is blank/NULL.
         "beginning_period_of_availa": {"pad_to_length": 0},
         "ending_period_of_availabil": {"pad_to_length": 0},
         "availability_type_code": {"pad_to_length": 0},
         "amount": {"strip_commas": True}}
    )

    # todo: find out how to handle dup rows (e.g., same tas/period/line number)
    # line numbers 2002 and 2012 are the only duped SF 133 report line numbers, and they are not used by the validation
    # rules, so for now just remove them before loading our SF-133 table
    # NOTE(review): the comment above says 2012 but the list below contains '2102' — confirm which is intended
    dupe_line_numbers = ['2002', '2102']
    data = data[~data.line.isin(dupe_line_numbers)]

    # Uppercasing DEFC to save on indexing
    # Empty values are still empty strings ('') at this point
    data['disaster_emergency_fund_code'] = data['disaster_emergency_fund_code'].str.upper()
    # collapse the 'QQQ' placeholder to 'Q' and turn empty strings into None
    data['disaster_emergency_fund_code'] = data['disaster_emergency_fund_code']. \
        apply(lambda x: x.replace('QQQ', 'Q') if x else None)

    # add concatenated TAS field for internal use (i.e., joining to staging tables)
    data['tas'] = data.apply(lambda row: format_internal_tas(row), axis=1)
    data['display_tas'] = ''
    data['amount'] = data['amount'].astype(float)

    data = fill_blank_sf133_lines(data)

    return data
def load_object_class(base_path):
    """ This function loads Object classes into the database, fully replacing the
        object_class table and writing a metrics JSON file alongside.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    # metrics are accumulated during the load and dumped to a JSON file at the end
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object',
            {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "object_class.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: delete existing rows, recording how many were removed
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {
                "max_oc_code": "object_class_code",
                "max_object_class_name": "object_class_name"
            },
            {"object_class_code": {
                "pad_to_length": 3
            }})
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_frec(file_name, force_reload=False):
    """ Load FREC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
            force_reload: whether to reload regardless

        Returns:
            True if new data was loaded, False if the load was skipped
    """
    sess = GlobalDB.db().session
    # existing FREC rows keyed by code; updated/deleted in place by the helpers below
    models = {frec.frec_code: frec for frec in sess.query(FREC)}

    # read FREC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data, FREC,
        {'frec': 'frec_code', 'cgac_agency_code': 'cgac_code', 'frec_entity_description': 'agency_name',
         'frec_abbreviation': 'frec_abbreviation', 'frec_cgac_association': 'frec_cgac',
         'icon_filename': 'icon_name'},
        {'frec': {'keep_null': False}, 'cgac_code': {'pad_to_length': 3}, 'frec_code': {'pad_to_length': 4}}
    )
    # NOTE(review): args=False relies on pandas treating a falsy `args` as "no extra positional args";
    # args=() would express that intent directly — confirm before changing.
    data['icon_name'] = data['icon_name'].apply(clean_col, args=False)
    # de-dupe: keep only rows flagged as the FREC/CGAC association, then drop the flag column
    data = data[data.frec_cgac == 'TRUE']
    data.drop(['frec_cgac'], axis=1, inplace=True)
    data.drop_duplicates(subset=['frec_code'], inplace=True)
    # create foreign key dicts (cgac_code <-> cgac_id, both directions)
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                 sess.query(CGAC).filter(CGAC.cgac_code.in_(data['cgac_code'])).all()}
    cgac_dict_flipped = {cgac_id: cgac_code for cgac_code, cgac_id in cgac_dict.items()}

    # compare to existing content in table
    def extract_cgac(row):
        # map a stored cgac_id back to its code for the dataframe diff; NaN when the id is unknown
        return cgac_dict_flipped[row['cgac_id']] if row['cgac_id'] in cgac_dict_flipped else np.nan
    diff_found = check_dataframe_diff(data, FREC, ['frec_id', 'cgac_id'], ['frec_code'],
                                      lambda_funcs=[('frec_abbreviation', extract_abbreviation),
                                                    ('agency_name', extract_name),
                                                    ('cgac_code', extract_cgac)])

    if force_reload or diff_found:
        # insert to db
        delete_missing_frecs(models, data)
        update_frecs(models, data, cgac_dict)
        sess.add_all(models.values())
        sess.commit()
        logger.info('%s FREC records inserted', len(models))
        return True
    else:
        logger.info('No differences found, skipping frec table reload.')
        return False
def load_object_class(base_path):
    """ This function loads Object classes into the database, refreshing the local CSV
        from the public reference URL when possible, fully replacing the object_class
        table, and writing a metrics JSON file alongside.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    # metrics are accumulated during the load and dumped to a JSON file at the end
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    filename = os.path.join(base_path, 'object_class.csv')
    try:
        # Update file from public S3 bucket
        object_class_url = '{}/object_class.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(object_class_url, allow_redirects=True)
        # don't overwrite the local copy with an HTTP error page
        r.raise_for_status()
        # context manager closes the handle deterministically (was previously leaked)
        with open(filename, 'wb') as object_class_csv:
            object_class_csv.write(r.content)
    except Exception:
        # best-effort refresh: fall back to the existing local copy, but leave a trace for operators
        logger.exception('Could not refresh object_class.csv from the public reference URL; using local copy')

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        # full refresh: delete existing rows, recording how many were removed
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num
    update_external_data_load_date(now, datetime.datetime.now(), 'object_class')

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def clean_duns_csv_data(data):
    """ Simple wrapper around clean_data applied just for duns

        Args:
            data: dataframe representing the data to be cleaned

        Returns:
            a dataframe cleaned and to be imported to the database
    """
    # delegate to the shared cleaner with the module-level DUNS column mappings and no extra options
    return clean_data(data, DUNS, column_mappings, {})
def run_duns_batches(file, sess, client, block_size=10000):
    """ Updates DUNS table in chunks from csv file

        Args:
            file: path to the DUNS export file to use
            sess: the database connection
            client: the connection to the SAM service
            block_size: the size of the batches to read from the DUNS export file.
    """
    logger.info("Retrieving total rows from duns file")
    start = datetime.now()
    duns_reader_obj = pd.read_csv(file, skipinitialspace=True, header=None, quotechar='"', dtype=str,
                                  names=column_headers, iterator=True, chunksize=block_size, skiprows=1)
    # NOTE: materializing every chunk up front means the whole file is held in memory;
    # the iterator only bounds the size of each individual dataframe
    duns_dfs = [duns_df for duns_df in duns_reader_obj]
    row_count = sum([len(duns_df.index) for duns_df in duns_dfs])
    logger.info("Retrieved row count of {} in {} s".format(row_count, (datetime.now() - start).total_seconds()))

    duns_added = 0
    for duns_df in duns_dfs:
        # Remove rows where awardee_or_recipient_uniqu is null
        duns_df = duns_df[duns_df['awardee_or_recipient_uniqu'].notnull()]
        # Ignore old DUNS we already have
        duns_to_load = remove_existing_duns(duns_df, sess)

        if not duns_to_load.empty:
            logger.info("Adding {} DUNS records from historic data".format(len(duns_to_load.index)))
            start = datetime.now()

            # get address info for incoming duns
            duns_to_load = update_duns_props(duns_to_load, client)
            duns_to_load = clean_data(duns_to_load, HistoricDUNS, column_mappings, {})
            duns_added += len(duns_to_load.index)

            # commit per batch so progress survives a mid-run failure
            insert_dataframe(duns_to_load, HistoricDUNS.__table__.name, sess.connection())
            sess.commit()

            logger.info("Finished updating {} DUNS rows in {} s".format(len(duns_to_load.index),
                                                                        (datetime.now() - start).total_seconds()))

    logger.info("Imported {} historical duns".format(duns_added))
def clean_sam_data(data):
    """ Wrapper around clean_data with the SAM Recipient context

        Args:
            data: the dataframe to be cleaned

        Returns:
            a cleaned/updated dataframe to be imported
    """
    # an empty frame needs no cleaning
    if data.empty:
        return data

    # pass every incoming column through unchanged
    identity_map = {col: col for col in data.columns}
    return clean_data(data, SAMRecipient, identity_map, {})
def format_fabs_data(data):
    """ Clean and format historical FABS data for loading.

        Args:
            data: raw dataframe read from the source file (mutated in place by the initial filtering)

        Returns:
            a cleaned dataframe with an afa_generated_unique column added and the key columns removed,
            or None if no rows survive the initial filter
    """
    # drop all records without any data to be loaded
    # BUG FIX: the original code did `data = data.replace('', np.nan, inplace=True)`; with inplace=True
    # the call returns None, so every subsequent statement crashed. Mutate in place without reassigning.
    data.replace('', np.nan, inplace=True)
    data.dropna(subset=["awarding office code", "awarding office name", "funding office name", "funding office code",
                        "funding agency name", "funding agency code", "funding sub tier agency code",
                        "funding sub tier agency name", "legal entity foreign city", "legal entity foreign province",
                        "legal entity foreign postal code", "legal entity foreign location description"],
                inplace=True)

    # ensure there are rows to be cleaned and formatted
    if len(data.index) == 0:
        return None

    cdata = clean_data(
        data,
        PublishedAwardFinancialAssistance,
        {
            "agency_code": "awarding_sub_tier_agency_c",
            "federal_award_mod": "award_modification_amendme",
            "federal_award_id": "fain",
            "uri": "uri",
            "awarding office code": "awarding_office_code",
            "awarding office name": "awarding_office_name",
            "funding office name": "funding_office_name",
            "funding office code": "funding_office_code",
            "funding agency name": "funding_agency_name",
            "funding agency code": "funding_agency_code",
            "funding sub tier agency code": "funding_sub_tier_agency_co",
            "funding sub tier agency name": "funding_sub_tier_agency_na",
            "legal entity foreign city": "legal_entity_foreign_city",
            "legal entity foreign province": "legal_entity_foreign_provi",
            "legal entity foreign postal code": "legal_entity_foreign_posta",
            "legal entity foreign location description": "legal_entity_foreign_descr"
        }, {}
    )

    # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented as
    # NULL in the db.
    cdata = cdata.replace(np.nan, '', regex=True)
    cdata = cdata.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

    # generate the afa_generated_unique field
    cdata['afa_generated_unique'] = cdata.apply(lambda x: generate_unique_string(x), axis=1)

    # drop columns in afa_generated_unique because we aren't updating them
    for col in ["awarding_sub_tier_agency_c", "award_modification_amendme", "fain", "uri"]:
        del cdata[col]

    return cdata
def load_defc(force_reload=False):
    """ Loads the DEFC data, refreshing the local CSV from the public reference URL when possible
        and replacing the defc table only when the data differs (or force_reload is set).

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    defc_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config', 'def_codes.csv')
    try:
        # Update file from public S3 bucket
        def_codes_url = '{}/def_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(def_codes_url, allow_redirects=True)
        # don't overwrite the local copy with an HTTP error page
        r.raise_for_status()
        # context manager closes the handle deterministically (was previously leaked)
        with open(defc_file, 'wb') as defc_csv:
            defc_csv.write(r.content)
    except Exception:
        # best-effort refresh: fall back to the existing local copy, but leave a trace for operators
        logger.exception('Could not refresh def_codes.csv from the public reference URL; using local copy')

    logger.info('Loading defc data')
    with create_app().app_context():
        data = pd.read_csv(defc_file, dtype=str)

        # Remove all invalid DEFCs that have been left in the file so USAS can continue to display them correctly
        data = data[data['Is Valid'] == 'true']

        data = clean_data(data, DEFC, {
            'defc': 'code',
            'group_name': 'group'
        }, {})

        diff_found = check_dataframe_diff(data, DEFC, ['defc_id'], ['code'])

        if force_reload or diff_found:
            sess = GlobalDB.db().session
            # delete any data in the DEFC table
            sess.query(DEFC).delete()

            # insert data into table
            num = insert_dataframe(data, DEFC.__table__.name, sess.connection())
            logger.info('{} records inserted to defc'.format(num))
            sess.commit()

            update_external_data_load_date(start_time, datetime.now(), 'defc')
        else:
            logger.info('No differences found, skipping defc table reload.')
def load_submission_window_schedule():
    """ Loads the submission window schedule data, fully replacing the
        submission_window_schedule table.
    """
    if CONFIG_BROKER["use_aws"]:
        # short-lived (10 min) presigned URL so pandas can read the CSV straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        sub_schedule_file = s3_client.generate_presigned_url(
            'get_object',
            {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "submission_window_schedule.csv"
            },
            ExpiresIn=600)
    else:
        sub_schedule_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config',
                                         'submission_window_schedule.csv')

    logger.info('Loading submission window schedule data')
    with create_app().app_context():
        data = pd.read_csv(sub_schedule_file, dtype=str)
        data = clean_data(
            data,
            SubmissionWindowSchedule,
            {
                'year': 'year',
                'period': 'period',
                'period_start': 'period_start',
                'publish_deadline': 'publish_deadline',
                'certification_deadline': 'certification_deadline'
            }, {})

        # Add a day to the deadlines because the dates in the file are supposed to be inclusive
        data['publish_deadline'] = data.apply(lambda x: add_day(x, 'publish_deadline'), axis=1)
        data['certification_deadline'] = data.apply(lambda x: add_day(x, 'certification_deadline'), axis=1)

        sess = GlobalDB.db().session
        # delete any data in the SubmissionWindowSchedule table
        sess.query(SubmissionWindowSchedule).delete()

        # insert data into table
        num = insert_dataframe(data, SubmissionWindowSchedule.__table__.name, sess.connection())
        logger.info('{} records inserted to submission_window_schedule'.format(num))
        sess.commit()
def load_sql(cls, filename):
    """ Load SQL-based validation rules to db.

        Reads the rule metadata CSV, attaches the rule SQL and derived ids, and replaces the
        RuleSql table only when the content differs from what is already stored.

        Args:
            filename: name of the rules csv file, resolved relative to cls.sql_rules_path

        Raises:
            Exception: if the csv contains a file_type or severity_name not in the known dicts
    """
    with create_app().app_context():
        sess = GlobalDB.db().session
        filename = os.path.join(cls.sql_rules_path, filename)

        # Initial load
        sql_data = pd.read_csv(filename, dtype=str, usecols=cls.headers)
        sql_data = clean_data(
            sql_data,
            RuleSql,
            {'rule_label': 'rule_label', 'rule_error_message': 'rule_error_message', 'query_name': 'query_name',
             'expected_value': 'expected_value', 'category': 'category', 'file_type': 'file_type',
             'target_file': 'target_file', 'rule_cross_file_flag': 'rule_cross_file_flag',
             'severity_name': 'severity_name'},
            {}
        )

        # Processing certain values
        sql_data['rule_sql'] = sql_data['query_name'].apply(lambda name: cls.read_sql_str(name))
        # lambda parameters renamed from `type` to avoid shadowing the builtin
        sql_data['file_id'] = sql_data['file_type'].apply(lambda file_type: FILE_TYPE_DICT.get(file_type, None))
        if sql_data['file_id'].isnull().values.any():
            raise Exception('Invalid file_type value found in sqlLoader. Must be one of the following: {}'
                            .format(', '.join(list(FILE_TYPE_DICT.keys()))))
        sql_data['target_file_id'] = sql_data['target_file'].apply(
            lambda file_type: FILE_TYPE_DICT.get(file_type, None))
        sql_data['rule_cross_file_flag'] = sql_data['rule_cross_file_flag'].apply(
            lambda flag: flag in ('true', 't', 'y', 'yes'))
        sql_data['rule_severity_id'] = sql_data['severity_name'].apply(
            lambda severity_name: RULE_SEVERITY_DICT.get(severity_name, None))
        if sql_data['rule_severity_id'].isnull().values.any():
            raise Exception('Invalid severity_name value found in sqlLoader Must be one of the following: {}'
                            .format(', '.join(list(RULE_SEVERITY_DICT.keys()))))
        sql_data.drop(['file_type', 'severity_name', 'target_file'], axis=1, inplace=True)

        # Final check if we need to actually reload
        if check_dataframe_diff(sql_data, RuleSql, del_cols=['rule_sql_id', 'created_at', 'updated_at'],
                                sort_cols=['rule_label', 'file_id', 'target_file_id']):
            # Delete and reload all records currently in table
            logger.info('Detected changes in {}, deleting RuleSQL and reloading'.format(cls.sql_rules_path))
            sess.query(RuleSql).delete()
            insert_dataframe(sql_data, RuleSql.__table__.name, sess.connection())
            sess.commit()
        else:
            logger.info('No changes detected since last load. Skipping.')
def load_cgac(file_name, force_reload=False):
    """ Load CGAC (high-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
            force_reload: whether to reload regardless

        Returns:
            True if new data was loaded, False if the load was skipped
    """
    sess = GlobalDB.db().session
    # existing CGAC rows keyed by code; updated/deleted in place by the helpers below
    models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

    # read CGAC values from csv
    data = pd.read_csv(file_name, dtype=str)

    # clean data
    data = clean_data(
        data, CGAC,
        {'cgac_agency_code': 'cgac_code', 'agency_name': 'agency_name',
         'agency_abbreviation': 'agency_abbreviation', 'icon_filename': 'icon_name'},
        {'cgac_code': {'pad_to_length': 3}}
    )
    # NOTE(review): args=False relies on pandas treating a falsy `args` as "no extra positional args";
    # args=() would express that intent directly — confirm before changing.
    data['icon_name'] = data['icon_name'].apply(clean_col, args=False)

    # de-dupe
    data.drop_duplicates(subset=['cgac_code'], inplace=True)

    # compare to existing content in table
    diff_found = check_dataframe_diff(data, CGAC, ['cgac_id'], ['cgac_code'],
                                      lambda_funcs=[('agency_abbreviation', extract_abbreviation),
                                                    ('agency_name', extract_name)])

    if force_reload or diff_found:
        delete_missing_cgacs(models, data)
        update_cgacs(models, data)
        sess.add_all(models.values())
        sess.commit()
        logger.info('%s CGAC records inserted', len(models))
        return True
    else:
        logger.info('No differences found, skipping cgac table reload.')
        return False
def clean_tas(csv_path): """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results Args: csv_path: path of the car_tas csv to import Returns: pandas dataframe of clean data imported from the cars_tas csv """ # Encoding accounts for cases where a column may include '\ufeff' data = pd.read_csv(csv_path, dtype=str, encoding='utf-8-sig') for column_mappings in [current_mappings, original_mappings]: try: data = clean_data( data, TASLookup, column_mappings, {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True}, "agency_identifier": {"pad_to_length": 3}, # Account for " " cells "availability_type_code": {"pad_to_length": 0, "keep_null": True}, "beginning_period_of_availa": {"pad_to_length": 0, "keep_null": True}, "ending_period_of_availabil": {"pad_to_length": 0, "keep_null": True}, "main_account_code": {"pad_to_length": 4}, "sub_account_code": {"pad_to_length": 3}, "budget_function_code": {"pad_to_length": 3, "keep_null": True}, "budget_subfunction_code": {"pad_to_length": 3, "keep_null": True}, "budget_bureau_code": {"pad_to_length": 2, "keep_null": True}, "reporting_agency_aid": {"pad_to_length": 3, "keep_null": True} } ) break except ValueError as e: if column_mappings != original_mappings: logger.info('Mismatched columns, trying again with different column set') else: logger.error('Encountered new column set: {}'.format(data.columns)) raise e # Drop all but the last instance of each account number data = data[~data.duplicated(subset=['account_num'], keep='last')] data["account_num"] = pd.to_numeric(data['account_num']) return data.where(pd.notnull(data), None)
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    # fix: log message previously misspelled "districts" as "districtions"
    logger.info("Adding congressional districts from census to the state_congressional table")

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data, model,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
def load_sub_tier_agencies(file_name):
    """ Load Sub Tier Agency (sub_tier-level agency names) lookup table.

        Args:
            file_name: path/url to the file to be read
    """
    sess = GlobalDB.db().session
    # existing sub tier agency rows keyed by code; updated/deleted in place by the helpers below
    models = {sub_tier_agency.sub_tier_agency_code: sub_tier_agency for
              sub_tier_agency in sess.query(SubTierAgency)}

    # read Sub Tier Agency values from csv
    data = pd.read_csv(file_name, dtype=str)

    # rows whose subtier code equals the FPDS department id take priority 1, all others 2
    condition = data["FPDS DEPARTMENT ID"] == data["SUBTIER CODE"]
    data.loc[condition, "PRIORITY"] = 1
    data.loc[~condition, "PRIORITY"] = 2

    # clean data
    data = clean_data(
        data, SubTierAgency,
        {"cgac_agency_code": "cgac_code", "subtier_code": "sub_tier_agency_code", "priority": "priority",
         "frec": "frec_code", "subtier_name": "sub_tier_agency_name", "is_frec": "is_frec"},
        {"cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4},
         "sub_tier_agency_code": {"pad_to_length": 4}}
    )
    # de-dupe
    data.drop_duplicates(subset=['sub_tier_agency_code'], inplace=True)
    # create foreign key dicts (code -> id for codes present in the incoming data)
    cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                 sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}
    frec_dict = {str(frec.frec_code): frec.frec_id for frec in
                 sess.query(FREC).filter(FREC.frec_code.in_(data["frec_code"])).all()}

    delete_missing_sub_tier_agencies(models, data)
    update_sub_tier_agencies(models, data, cgac_dict, frec_dict)
    sess.add_all(models.values())
    sess.commit()

    logger.info('%s Sub Tier Agency records inserted', len(models))
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        # presigned URL (10 min expiry) lets pandas read the csv straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object',
                                                    {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                     'Key': "country_codes.csv"},
                                                    ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session

        # wipe the existing country codes before reloading
        sess.query(CountryCode).delete()

        country_frame = pd.read_csv(filename, dtype=str)
        country_frame = clean_data(
            country_frame,
            CountryCode,
            {"country_code": "country_code",
             "country_name": "country_name"},
            {}
        )
        # keep only the first row seen for each country code
        country_frame.drop_duplicates(subset=['country_code'], inplace=True)

        # write the cleaned rows and commit
        target_table = CountryCode.__table__.name
        inserted = insert_dataframe(country_frame, target_table, sess.connection())
        sess.commit()
        logger.info('{} records inserted to {}'.format(inserted, target_table))
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info("Adding congressional districts from census to the state_congressional table")

    census_frame = pd.read_csv(census_file, dtype=str)
    # normalize headers/values into StateCongressional columns, zero-padding districts to 2 chars
    cleaned = clean_data(
        census_frame,
        StateCongressional,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    # append the cleaned census rows and commit
    insert_dataframe(cleaned, StateCongressional.__table__.name, sess.connection())
    sess.commit()
def clean_tas(csv_path, metrics=None):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results

        Args:
            csv_path: path of the car_tas csv to import
            metrics: an object containing information for the metrics file

        Returns:
            pandas dataframe of clean data imported from the cars_tas csv
    """
    if not metrics:
        metrics = {'records_provided': 0, 'duplicates_dropped': 0}
    # Encoding accounts for cases where a column may include '\ufeff'
    data = pd.read_csv(csv_path, dtype=str, encoding='utf-8-sig')
    # Track this file's own row count. metrics['records_provided'] accumulates across
    # calls, so the original `metrics['records_provided'] - len(data.index)` over-counted
    # duplicates whenever a shared metrics dict was used for several files.
    records_in_file = len(data.index)
    metrics['records_provided'] += records_in_file
    # Try the current column layout first; fall back to the original layout on mismatch
    for column_mappings in [current_mappings, original_mappings]:
        try:
            data = clean_data(
                data,
                TASLookup,
                column_mappings,
                {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True},
                 "agency_identifier": {"pad_to_length": 3},
                 # Account for " " cells
                 "availability_type_code": {"pad_to_length": 0, "keep_null": True},
                 "beginning_period_of_availa": {"pad_to_length": 0, "keep_null": True},
                 "ending_period_of_availabil": {"pad_to_length": 0, "keep_null": True},
                 "main_account_code": {"pad_to_length": 4},
                 "sub_account_code": {"pad_to_length": 3},
                 "budget_function_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_subfunction_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_bureau_code": {"pad_to_length": 2, "keep_null": True},
                 "reporting_agency_aid": {"pad_to_length": 3, "keep_null": True}
                 }
            )
            break
        except ValueError as e:
            if column_mappings != original_mappings:
                logger.info('Mismatched columns, trying again with different column set')
            else:
                # Neither mapping matched -- the file format has changed
                logger.error('Encountered new column set: {}'.format(data.columns))
                raise e
    # Drop all but the last instance of each account number
    data = data[~data.duplicated(subset=['account_num'], keep='last')]
    metrics['duplicates_dropped'] += records_in_file - len(data.index)
    data["account_num"] = pd.to_numeric(data['account_num'])
    # Replace NaN with None so nulls load into the db as real NULLs
    return data.where(pd.notnull(data), None)
def load_cfda_program(base_path):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
    """
    if CONFIG_BROKER["use_aws"]:
        # presigned URL (10 min expiry) lets pandas read the csv straight from S3
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "cfda_program.csv"
            }, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "cfda_program.csv")
    logger.info('Loading CFDA program file: ' + "cfda_program.csv")
    """Load country code lookup table."""
    # NOTE(review): the string literal above is a no-op statement and is inaccurate
    # (this function loads CFDA programs, not country codes) -- candidate for removal.
    model = CFDAProgram
    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        # delete and replace all existing CFDA program rows
        sess.query(model).delete()

        # 'latin1' handles the non-UTF-8 characters present in the source file
        data = pd.read_csv(filename, dtype=str, encoding='latin1')
        # Map the CFDA csv headers (with their numbered section suffixes, e.g. "_(020)")
        # onto CFDAProgram column names; no padding rules needed.
        data = clean_data(
            data, model, {
                "program_title": "program_title",
                "program_number": "program_number",
                "popular_name_(020)": "popular_name",
                "federal_agency_(030)": "federal_agency",
                "authorization_(040)": "authorization",
                "objectives_(050)": "objectives",
                "types_of_assistance_(060)": "types_of_assistance",
                "uses_and_use_restrictions_(070)": "uses_and_use_restrictions",
                "applicant_eligibility_(081)": "applicant_eligibility",
                "beneficiary_eligibility_(082)": "beneficiary_eligibility",
                "credentials/documentation_(083)": "credentials_documentation",
                "preapplication_coordination_(091)": "preapplication_coordination",
                "application_procedures_(092)": "application_procedures",
                "award_procedure_(093)": "award_procedure",
                "deadlines_(094)": "deadlines",
                "range_of_approval/disapproval_time_(095)": "range_of_approval_disapproval_time",
                "appeals_(096)": "appeals",
                "renewals_(097)": "renewals",
                "formula_and_matching_requirements_(101)": "formula_and_matching_requirements",
                "length_and_time_phasing_of_assistance_(102)": "length_and_time_phasing_of_assistance",
                "reports_(111)": "reports",
                "audits_(112)": "audits",
                "records_(113)": "records",
                "account_identification_(121)": "account_identification",
                "obligations_(122)": "obligations",
                "range_and_average_of_financial_assistance_(123)": "range_and_average_of_financial_assistance",
                "program_accomplishments_(130)": "program_accomplishments",
                "regulations__guidelines__and_literature_(140)": "regulations_guidelines_and_literature",
                "regional_or__local_office_(151)": "regional_or_local_office",
                "headquarters_office_(152)": "headquarters_office",
                "website_address_(153)": "website_address",
                "related_programs_(160)": "related_programs",
                "examples_of_funded_projects_(170)": "examples_of_funded_projects",
                "criteria_for_selecting_proposals_(180)": "criteria_for_selecting_proposals",
                "url": "url",
                "recovery": "recovery",
                "omb_agency_code": "omb_agency_code",
                "omb_bureau_code": "omb_bureau_code",
                "published_date": "published_date",
                "archived_date": "archived_date"
            }, {})
        # normalize the two date columns before loading
        data["published_date"] = format_date(data["published_date"])
        data["archived_date"] = format_date(data["archived_date"])

        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()
        logger.info('{} records inserted to {}'.format(num, table_name))
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: whether to read a local file instead of downloading from S3_CFDA_FILE
            local_file_name: name of the local file to read when load_local is True
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        # unique temp name so concurrent runs don't collide
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is always closed; the original
        # open(filename, 'wb').write(r.content) leaked the handle until GC.
        with open(filename, 'wb') as cfda_file:
            cfda_file.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        # Re-round the Decimal-sourced program number and left-justify with zeroes
        # to 6 characters (e.g. 10.000999... -> "10.001")
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        now = datetime.utcnow()

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates.
        # We'll set it universally to 1
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)
        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the db's audit dates in the created dataframe, and
        # also set all the pks to 1, so they match
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)
        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)
        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]
        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]
        # need to reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)
        # When pandas pulls the data out of postgres, the program_number column is a Decimal
        # that loses precision in the dataframe (10.001 imports as 10.000999999999999), so it
        # must be re-rounded, cast to a string, and padded with trailing zeroes.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))

        # True only when the imported data differs from what was loaded last time
        new_data = not import_dataframe.equals(current_data)

        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        # distinct exit code signals "nothing to do" to the caller
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
def format_fabs_data(data, sess, fips_state_list, state_code_list, sub_tier_list, county_code_list):
    """ Clean, derive, and rename historical award data into PublishedAwardFinancialAssistance columns.

        Args:
            data: dataframe of raw historical award records
            sess: database session, passed through to the derive_* helpers
            fips_state_list: state lookup passed to the derivation helpers -- presumably FIPS-keyed;
                confirm against the derive_* helper signatures
            state_code_list: state code lookup passed to the derivation helpers
            sub_tier_list: sub tier agency lookup used to derive the awarding agency fields
            county_code_list: county code lookup used to derive county names

        Returns:
            a cleaned dataframe ready for load, or None if every row lacked both FAIN and URI
    """
    logger.info("formatting data")
    # drop rows with null FAIN and URI
    data = data[~((data['federal_award_id'].isnull()) & (data['uri'].isnull()))].copy()
    if len(data.index) == 0:
        return None

    # normalize casing on the name/address columns
    proper_casing_cols = ['recipient_name', 'recipient_city_name', 'recipient_county_name', 'receip_addr1',
                          'receip_addr2', 'receip_addr3']
    for col in proper_casing_cols:
        data[col] = data.apply(lambda x: format_proper_casing(x, col), axis=1)

    # these columns carry trailing text after a colon that must be stripped
    cols_with_colons = ['action_type', 'assistance_type', 'agency_code', 'recipient_type', 'correction_late_ind']
    for col in cols_with_colons:
        data[col] = data.apply(lambda x: remove_data_after_colon(x, col), axis=1)

    # data['recipient_city_code'] = data.apply(lambda x: format_integer_code(x, 'recipient_city_code', 5), axis=1)
    # per-column reformatting (codes, zips, amounts, dates)
    data['recipient_county_code'] = data.apply(lambda x: format_integer_code(x, 'recipient_county_code', 3), axis=1)
    data['legal_entity_zip5'] = data.apply(lambda x: format_zip_five(x), axis=1)
    data['legal_entity_zip_last4'] = data.apply(lambda x: format_zip_four(x), axis=1)
    data['total_funding_amount'] = data.apply(lambda x: format_total_funding(x), axis=1)
    data['starting_date'] = data.apply(lambda x: format_date(x, 'starting_date'), axis=1)
    data['ending_date'] = data.apply(lambda x: format_date(x, 'ending_date'), axis=1)
    data['record_type'] = data.apply(lambda x: format_record_type(x), axis=1)
    data['principal_place_zip'] = data.apply(lambda x: format_full_zip(x), axis=1)
    data['principal_place_cd'] = data.apply(lambda x: format_cd(x, 'principal_place_cd'), axis=1)
    data['recipient_cd'] = data.apply(lambda x: format_cd(x, 'recipient_cd'), axis=1)
    # every record produced by this loader is flagged historical
    data['is_historical'] = np.full(len(data.index), True, dtype=bool)

    # row-by-row derivations; each is a per-row apply, so these are the slow steps
    logger.info("Starting derive_legal_entity_city_code")
    data['legal_entity_city_code'] = data.apply(lambda x: derive_legal_entity_city_code(x, sess), axis=1)
    logger.info("Starting derive_awarding_agency_code")
    data['awarding_agency_code'] = data.apply(lambda x: derive_awarding_agency_code(x, sub_tier_list), axis=1)
    logger.info("Starting derive_awarding_agency_name")
    data['awarding_agency_name'] = data.apply(lambda x: derive_awarding_agency_name(x, sub_tier_list), axis=1)
    logger.info("Starting derive_awarding_sub_tier_agency_n")
    data['awarding_sub_tier_agency_n'] = data.apply(lambda x: derive_awarding_sub_tier_agency_n(x, sub_tier_list),
                                                    axis=1)
    logger.info("Starting derive_place_of_perform_county_na")
    data['place_of_perform_county_na'] = data.apply(lambda x: derive_place_of_perform_county_na(x, sess,
                                                                                                fips_state_list,
                                                                                                state_code_list,
                                                                                                county_code_list),
                                                    axis=1)
    logger.info("Starting derive_place_of_performance_city")
    data['place_of_perform_city'] = data.apply(lambda x: derive_place_of_performance_city(x, sess, fips_state_list,
                                                                                          state_code_list), axis=1)
    logger.info("Starting derive_legal_entity_state_name")
    data['legal_entity_state_name'] = data.apply(lambda x: derive_legal_entity_state_name(x, sess, fips_state_list,
                                                                                          state_code_list), axis=1)
    logger.info("Finished derivations")

    # adding columns missing from historical data
    null_list = [
        'awarding_office_code', 'awarding_office_name', 'funding_office_name', 'funding_agency_name',
        'funding_agency_code', 'funding_office_code', 'funding_sub_tier_agency_co', 'funding_sub_tier_agency_na',
        'legal_entity_foreign_city', 'legal_entity_foreign_posta', 'legal_entity_foreign_provi',
        'place_of_performance_forei'
    ]
    for item in null_list:
        data[item] = None

    # rename the historical columns to PublishedAwardFinancialAssistance column names,
    # zero-padding the two congressional district columns to 2 characters
    cdata = clean_data(
        data,
        PublishedAwardFinancialAssistance,
        {
            'obligation_action_date': 'action_date',
            'action_type': 'action_type',
            'assistance_type': 'assistance_type',
            'project_description': 'award_description',
            'recipient_name': 'awardee_or_recipient_legal',
            'duns_no': 'awardee_or_recipient_uniqu',
            'awarding_agency_code': 'awarding_agency_code',
            'awarding_agency_name': 'awarding_agency_name',
            'awarding_office_code': 'awarding_office_code',
            'awarding_office_name': 'awarding_office_name',
            'agency_code': 'awarding_sub_tier_agency_c',
            'awarding_sub_tier_agency_n': 'awarding_sub_tier_agency_n',
            'federal_award_mod': 'award_modification_amendme',
            'rec_flag': 'business_funds_indicator',
            'recipient_type': 'business_types',
            'cfda_program_num': 'cfda_number',
            'cfda_program_title': 'cfda_title',
            'correction_late_ind': 'correction_delete_indicatr',
            'face_loan_guran': 'face_value_loan_guarantee',
            'federal_award_id': 'fain',
            'fed_funding_amount': 'federal_action_obligation',
            'fyq_correction': 'fiscal_year_and_quarter_co',
            'funding_agency_name': 'funding_agency_name',
            'funding_agency_code': 'funding_agency_code',
            'funding_office_code': 'funding_office_code',
            'funding_office_name': 'funding_office_name',
            'funding_sub_tier_agency_co': 'funding_sub_tier_agency_co',
            'funding_sub_tier_agency_na': 'funding_sub_tier_agency_na',
            'is_historical': 'is_historical',
            'receip_addr1': 'legal_entity_address_line1',
            'receip_addr2': 'legal_entity_address_line2',
            'receip_addr3': 'legal_entity_address_line3',
            'legal_entity_city_code': 'legal_entity_city_code',
            'recipient_city_name': 'legal_entity_city_name',
            'recipient_cd': 'legal_entity_congressional',
            'recipient_country_code': 'legal_entity_country_code',
            'recipient_county_code': 'legal_entity_county_code',
            'recipient_county_name': 'legal_entity_county_name',
            'legal_entity_foreign_city': 'legal_entity_foreign_city',
            'legal_entity_foreign_posta': 'legal_entity_foreign_posta',
            'legal_entity_foreign_provi': 'legal_entity_foreign_provi',
            'recipient_state_code': 'legal_entity_state_code',
            'legal_entity_zip5': 'legal_entity_zip5',
            'legal_entity_state_name': 'legal_entity_state_name',
            'legal_entity_zip_last4': 'legal_entity_zip_last4',
            'last_modified_date': 'modified_at',
            'non_fed_funding_amount': 'non_federal_funding_amount',
            'orig_sub_guran': 'original_loan_subsidy_cost',
            'ending_date': 'period_of_performance_curr',
            'starting_date': 'period_of_performance_star',
            'principal_place_code': 'place_of_performance_code',
            'principal_place_cd': 'place_of_performance_congr',
            'principal_place_country_code': 'place_of_perform_country_c',
            'place_of_performance_forei': 'place_of_performance_forei',
            'principal_place_zip': 'place_of_performance_zip4a',
            'place_of_perform_city': 'place_of_performance_city',
            'place_of_perform_county_na': 'place_of_perform_county_na',
            'principal_place_state': 'place_of_perform_state_nam',
            'record_type': 'record_type',
            'sai_number': 'sai_number',
            'total_funding_amount': 'total_funding_amount',
            'uri': 'uri'
        },
        {
            'place_of_performance_congr': {'pad_to_length': 2, 'keep_null': True},
            'legal_entity_congressional': {'pad_to_length': 2, 'keep_null': True}
        }
    )

    # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented as
    # NULL in the db.
    cdata = cdata.replace(np.nan, '', regex=True)
    cdata = cdata.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

    # generate the afa_generated_unique field
    cdata['afa_generated_unique'] = cdata.apply(lambda x: generate_unique_string(x), axis=1)
    return cdata
def clean_tas(csv_path):
    """ Read a CSV into a dataframe, then use a configured `clean_data` and return the results

        Args:
            csv_path: path of the car_tas csv to import

        Returns:
            pandas dataframe of clean data imported from the cars_tas csv
    """
    # Encoding accounts for cases where a column may include '\ufeff'
    data = pd.read_csv(csv_path, dtype=str, encoding='utf-8-sig')
    # Try the current column layout first; fall back to the original layout on mismatch
    for column_mappings in [current_mappings, original_mappings]:
        try:
            data = clean_data(
                data,
                TASLookup,
                column_mappings,
                {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True},
                 "agency_identifier": {"pad_to_length": 3},
                 # Account for " " cells
                 "availability_type_code": {"pad_to_length": 0, "keep_null": True},
                 "beginning_period_of_availa": {"pad_to_length": 0, "keep_null": True},
                 "ending_period_of_availabil": {"pad_to_length": 0, "keep_null": True},
                 "main_account_code": {"pad_to_length": 4},
                 "sub_account_code": {"pad_to_length": 3},
                 "budget_function_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_subfunction_code": {"pad_to_length": 3, "keep_null": True},
                 "budget_bureau_code": {"pad_to_length": 2, "keep_null": True},
                 "reporting_agency_aid": {"pad_to_length": 3, "keep_null": True}
                 }
            )
            break
        except ValueError as e:
            if column_mappings != original_mappings:
                logger.info('Mismatched columns, trying again with different column set')
            else:
                # Neither mapping matched -- the file format has changed
                logger.error('Encountered new column set: {}'.format(data.columns))
                raise e
    # Drop all but the last instance of each account number; the sibling clean_tas
    # variants do this before loading, and without it repeated account_num rows from
    # cars_tas would flow through to the db load.
    data = data[~data.duplicated(subset=['account_num'], keep='last')]
    data["account_num"] = pd.to_numeric(data['account_num'])
    # Replace NaN with None so nulls load into the db as real NULLs
    return data.where(pd.notnull(data), None)
def parse_exec_comp_file(file_path, metrics=None):
    """ Parses the executive compensation file to update corresponding SAM records

        Args:
            file_path: the path to the SAM file
            metrics: dictionary representing metrics of the script

        Raises:
            Exception: couldn't extract the last exec comp modification date, this generally means the filename
                provided doesn't match the expected format.

        Returns:
            dataframe representing the contents in the file
    """
    if not metrics:
        metrics = {
            'files_processed': [],
            'records_received': 0,
            'records_processed': 0
        }
    logger.info('Starting file ' + file_path)

    # The zip is expected to contain one .dat named after the archive; the underscore-split
    # filename carries the file date (last token) and the period (4th token).
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    dat_file_name = file_name + '.dat'
    file_name_props = file_name.split('_')
    dat_file_date = file_name_props[-1]
    version = 'v2' if 'V2' in file_name else 'v1'
    period = file_name_props[3]
    zfile = zipfile.ZipFile(file_path)

    # positional (0-based column index) mappings within the pipe-delimited .dat
    v1_column_header_mapping = {
        'awardee_or_recipient_uniqu': 0,
        'sam_extract_code': 4,
        'exec_comp_str': 89
    }
    v2_column_header_mapping = {
        'uei': 0,
        'awardee_or_recipient_uniqu': 1,
        'sam_extract_code': 5,
        'exec_comp_str': 91
    }
    column_header_mapping = v1_column_header_mapping if version == 'v1' else v2_column_header_mapping
    # sort by column index so names= and usecols= line up in read_csv
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    key_col = 'awardee_or_recipient_uniqu' if version == 'v1' else 'uei'

    # can't use skipfooter, pandas' c engine doesn't work with skipfooter and the python engine doesn't work with dtype
    nrows = 0
    with zfile.open(dat_file_name) as dat_file:
        nrows = len(dat_file.readlines()) - 2  # subtract the header and footer
    with zfile.open(dat_file_name) as dat_file:
        csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                               usecols=column_header_mapping_ordered.values(),
                               names=column_header_mapping_ordered.keys(), quoting=3)
    total_data = csv_data.copy()
    records_received = len(total_data.index)

    # trimming all columns before cleaning to ensure the sam_extract is working as intended
    total_data = total_data.applymap(lambda x: trim_item(x) if len(str(x).strip()) else None)

    # keep only rows that have a key and one of the extract codes of interest
    total_data = total_data[total_data[key_col].notnull()
                            & total_data['sam_extract_code'].isin(['2', '3', 'A', 'E'])]
    records_processed = len(total_data.index)
    del total_data['sam_extract_code']

    # drop SAM duplicates, taking only the last one (the first one for MONTHLY files)
    keep = 'first' if period == 'MONTHLY' else 'last'
    total_data.drop_duplicates(subset=[key_col], keep=keep, inplace=True)

    # Note: we're splitting these up cause it vastly saves memory parsing only the records that are populated
    blank_exec = total_data[total_data['exec_comp_str'].isnull()]
    pop_exec = total_data[total_data['exec_comp_str'].notnull()]

    # parse out executive compensation from row 90 for populated records
    if not pop_exec.empty:
        lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
        parsed_data = pop_exec['exec_comp_str'].apply(lambda_func)
        # parse_exec_comp() with no args supplies the canonical column names
        parsed_data.columns = list(parse_exec_comp().keys())
        pop_exec = pd.concat([pop_exec, parsed_data], axis=1)
    else:
        # no populated rows: just attach the (empty) exec comp columns
        pop_exec = pop_exec.assign(**parse_exec_comp())

    # leave blanks
    blank_exec = blank_exec.assign(**parse_exec_comp())

    # setup the final dataframe
    total_data = pd.concat([pop_exec, blank_exec])
    del total_data['exec_comp_str']
    total_data.replace('', np.nan, inplace=True)
    # the trailing filename token is the extract date, e.g. 20200131
    last_exec_comp_mod_date = datetime.datetime.strptime(dat_file_date, '%Y%m%d').date()
    total_data = total_data.assign(last_exec_comp_mod_date=last_exec_comp_mod_date)

    # Cleaning out any untrimmed strings
    if not total_data.empty:
        exec_comp_maps = {
            'awardee_or_recipient_uniqu': 'awardee_or_recipient_uniqu',
            'high_comp_officer1_amount': 'high_comp_officer1_amount',
            'high_comp_officer1_full_na': 'high_comp_officer1_full_na',
            'high_comp_officer2_amount': 'high_comp_officer2_amount',
            'high_comp_officer2_full_na': 'high_comp_officer2_full_na',
            'high_comp_officer3_amount': 'high_comp_officer3_amount',
            'high_comp_officer3_full_na': 'high_comp_officer3_full_na',
            'high_comp_officer4_amount': 'high_comp_officer4_amount',
            'high_comp_officer4_full_na': 'high_comp_officer4_full_na',
            'high_comp_officer5_amount': 'high_comp_officer5_amount',
            'high_comp_officer5_full_na': 'high_comp_officer5_full_na',
            'last_exec_comp_mod_date': 'last_exec_comp_mod_date'
        }
        if version == 'v2':
            exec_comp_maps['uei'] = 'uei'
        total_data = clean_data(total_data, SAMRecipient, exec_comp_maps, {})
        # clean_data adds audit columns the caller doesn't want; drop them
        total_data.drop(columns=['created_at', 'updated_at'], inplace=True)

    metrics['files_processed'].append(dat_file_name)
    metrics['records_received'] += records_received
    metrics['records_processed'] += records_processed

    return total_data
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        # unique temp name so concurrent runs don't collide
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        # Use a context manager so the file handle is always closed; the original
        # open(filename, 'wb').write(r.content) leaked the handle until GC.
        with open(filename, 'wb') as cfda_file:
            cfda_file.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        # Re-round the Decimal-sourced program number and left-justify with zeroes
        # to 6 characters (e.g. 10.000999... -> "10.001")
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        # exits before writing metrics; exit code 3 signals "nothing to do"
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        # Fixed typo in log message ("progrma" -> "program")
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            # NOTE(review): pd.io.common.EmptyDataError was removed in newer pandas;
            # pd.errors.EmptyDataError is the long-term spelling -- confirm pinned version.
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return
            headers = set([header.upper() for header in list(raw_data)])

            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id',
                     'allocation_id': 'allocation_transfer_id', 'account_code': 'account_number',
                     'pa_code': 'program_activity_code', 'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # because we're only loading a subset of program activity info, there will be duplicate records in the
            # dataframe. this is ok, but need to de-duped before the db load. We also need to log them.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

        if export:
            export_public_pa(raw_data)

        end_time = datetime.datetime.now()
        update_external_data_load_date(now, end_time, 'program_activity')
        update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['records_inserted'] = num

        metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        # some rows were invalid but under the failure threshold
        exit_if_nonlocal(3)
        return