def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info(
        "Adding congressional districts from census to the state_congressional table"
    )

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data, model, {
            "state_code": "state_code",
            "congressional_district_no": "congressional_district_no",
            "census_year": "census_year"
        }, {'congressional_district_no': {
            "pad_to_length": 2
        }})

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
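# Illustrative sketch (not the broker's clean_data helper): the rename-and-pad
# step above can be reproduced with plain pandas. rename_and_pad and the sample
# frame below are hypothetical.
import pandas as pd

def rename_and_pad(data, field_map, pad_lengths):
    """ Rename columns per field_map, keep only mapped columns, zero-pad selected ones. """
    data = data.rename(columns=field_map)[list(field_map.values())]
    for col, length in pad_lengths.items():
        data[col] = data[col].str.zfill(length)
    return data

# e.g. a district of '1' becomes '01', similar to the pad_to_length=2 option above
sample = pd.DataFrame({'state_code': ['VA'], 'congressional_district_no': ['1'], 'census_year': ['2020']})
sample = rename_and_pad(sample, {'state_code': 'state_code',
                                 'congressional_district_no': 'congressional_district_no',
                                 'census_year': 'census_year'},
                        {'congressional_district_no': 2})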
def create_temp_exec_comp_table(sess, table_name, data):
    """ Creates a temporary executive compensation table with the given name and data.

        Args:
            sess: database connection
            table_name: what to name the table being created
            data: pandas dataframe representing exec comp data
    """
    logger.info('Making {} table'.format(table_name))
    create_table_sql = """
            CREATE TABLE IF NOT EXISTS {} (
                awardee_or_recipient_uniqu TEXT,
                high_comp_officer1_amount TEXT,
                high_comp_officer1_full_na TEXT,
                high_comp_officer2_amount TEXT,
                high_comp_officer2_full_na TEXT,
                high_comp_officer3_amount TEXT,
                high_comp_officer3_full_na TEXT,
                high_comp_officer4_amount TEXT,
                high_comp_officer4_full_na TEXT,
                high_comp_officer5_amount TEXT,
                high_comp_officer5_full_na TEXT,
                last_exec_comp_mod_date DATE
            );
        """.format(table_name)
    sess.execute(create_table_sql)
    # Truncating in case we didn't clear out this table after a failure in the script
    sess.execute('TRUNCATE TABLE {};'.format(table_name))
    insert_dataframe(data, table_name, sess.connection())
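# Minimal, self-contained sketch of the create/truncate/insert pattern above,
# using an in-memory SQLite database and DataFrame.to_sql as a stand-in for the
# broker's insert_dataframe helper. Table and column names are illustrative.
import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine('sqlite://')
with engine.begin() as conn:
    conn.execute(sa.text('CREATE TABLE IF NOT EXISTS temp_exec_comp ('
                         'awardee_or_recipient_uniqu TEXT, high_comp_officer1_amount TEXT)'))
    conn.execute(sa.text('DELETE FROM temp_exec_comp'))  # SQLite has no TRUNCATE
    df = pd.DataFrame({'awardee_or_recipient_uniqu': ['123456789'],
                       'high_comp_officer1_amount': ['100000']})
    df.to_sql('temp_exec_comp', conn, if_exists='append', index=False)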
def create_temp_sam_recipient_table(sess, table_name, data):
    """ Creates a temporary SAM table with the given name and data.

        Args:
            sess: database connection
            table_name: what to name the table being created
            data: pandas dataframe representing SAM data
    """
    logger.info('Making {} table'.format(table_name))
    column_types = {
        'created_at': 'TIMESTAMP WITHOUT TIME ZONE',
        'updated_at': 'TIMESTAMP WITHOUT TIME ZONE',
        'uei': 'TEXT',
        'awardee_or_recipient_uniqu': 'TEXT',
        'activation_date': 'DATE',
        'expiration_date': 'DATE',
        'deactivation_date': 'DATE',
        'registration_date': 'DATE',
        'last_sam_mod_date': 'DATE',
        'legal_business_name': 'TEXT',
        'dba_name': 'TEXT',
        'ultimate_parent_uei': 'TEXT',
        'ultimate_parent_unique_ide': 'TEXT',
        'ultimate_parent_legal_enti': 'TEXT',
        'address_line_1': 'TEXT',
        'address_line_2': 'TEXT',
        'city': 'TEXT',
        'state': 'TEXT',
        'zip': 'TEXT',
        'zip4': 'TEXT',
        'country_code': 'TEXT',
        'congressional_district': 'TEXT',
        'business_types_codes': 'TEXT[]',
        'business_types': 'TEXT[]',
        'entity_structure': 'TEXT',
        'high_comp_officer1_amount': 'TEXT',
        'high_comp_officer1_full_na': 'TEXT',
        'high_comp_officer2_amount': 'TEXT',
        'high_comp_officer2_full_na': 'TEXT',
        'high_comp_officer3_amount': 'TEXT',
        'high_comp_officer3_full_na': 'TEXT',
        'high_comp_officer4_amount': 'TEXT',
        'high_comp_officer4_full_na': 'TEXT',
        'high_comp_officer5_amount': 'TEXT',
        'high_comp_officer5_full_na': 'TEXT',
        'last_exec_comp_mod_date': 'DATE'
    }
    columns = ', '.join(['{} {}'.format(column_name, column_type) for column_name, column_type in column_types.items()
                         if column_name in list(data.columns)])
    create_table_sql = 'CREATE TABLE IF NOT EXISTS {} ({});'.format(table_name, columns)
    sess.execute(create_table_sql)
    # Truncating in case we didn't clear out this table after a failure in the script
    sess.execute('TRUNCATE TABLE {};'.format(table_name))
    insert_dataframe(data, table_name, sess.connection())
def run_duns_batches(file, sess, client, block_size=10000):
    """ Updates DUNS table in chunks from csv file

        Args:
            file: path to the DUNS export file to use
            sess: the database connection
            client: the connection to the SAM service
            block_size: the size of the batches to read from the DUNS export file.
    """
    logger.info("Retrieving total rows from duns file")
    start = datetime.now()
    duns_reader_obj = pd.read_csv(file,
                                  skipinitialspace=True,
                                  header=None,
                                  quotechar='"',
                                  dtype=str,
                                  names=column_headers,
                                  iterator=True,
                                  chunksize=block_size,
                                  skiprows=1)
    duns_dfs = [duns_df for duns_df in duns_reader_obj]
    row_count = sum([len(duns_df.index) for duns_df in duns_dfs])
    logger.info("Retrieved row count of {} in {} s".format(
        row_count, (datetime.now() - start).total_seconds()))

    duns_added = 0
    for duns_df in duns_dfs:
        # Remove rows where awardee_or_recipient_uniqu is null
        duns_df = duns_df[duns_df['awardee_or_recipient_uniqu'].notnull()]
        # Ignore old DUNS we already have
        duns_to_load = remove_existing_duns(duns_df, sess)

        if not duns_to_load.empty:
            logger.info("Adding {} DUNS records from historic data".format(
                len(duns_to_load.index)))
            start = datetime.now()

            # get address info for incoming duns
            duns_to_load = update_duns_props(duns_to_load, client)
            duns_to_load = clean_data(duns_to_load, HistoricDUNS,
                                      column_mappings, {})
            duns_added += len(duns_to_load.index)

            insert_dataframe(duns_to_load, HistoricDUNS.__table__.name,
                             sess.connection())
            sess.commit()

            logger.info("Finished updating {} DUNS rows in {} s".format(
                len(duns_to_load.index),
                (datetime.now() - start).total_seconds()))

    logger.info("Imported {} historical duns".format(duns_added))
def parse_sam_file(file, sess):
    """ Parse a SAM extract file and apply the add/update/delete records to the executive compensation table.

        Args:
            file: the zip file of the SAM extract to parse
            sess: the database connection
    """
    logger.info("starting file " + str(file.name))

    csv_file = os.path.splitext(os.path.basename(file.name))[0]+'.dat'
    zfile = zipfile.ZipFile(file.name)

    # can't use skipfooter, pandas' c engine doesn't work with skipfooter and the python engine doesn't work with dtype
    nrows = 0
    with zfile.open(csv_file) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer
    column_header_mapping = {
        "awardee_or_recipient_uniqu": 0,
        "sam_extract": 4,
        "expiration_date": 7,
        "activation_date": 9,
        "ultimate_parent_legal_enti": 10,
        "ultimate_parent_unique_ide": 48,
        "exec_comp_str": 89
    }
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    csv_data = pd.read_csv(zfile.open(csv_file), dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                           usecols=column_header_mapping_ordered.values(), names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()

    # skipping when sam_extract == '4' as it's expired
    total_data = total_data[total_data.sam_extract != '4']

    # parse out executive compensation from the exec_comp_str column
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = total_data["exec_comp_str"].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del total_data["exec_comp_str"]
    total_data = total_data.join(parsed_data)

    # split into 3 dataframes based on the sam_extract value ('1', '2', '3')
    delete_data = total_data[total_data.sam_extract == '1'].replace(np.nan, "", regex=True)
    add_data = total_data[total_data.sam_extract == '2'].replace(np.nan, "", regex=True)
    update_data = total_data[total_data.sam_extract == '3'].replace(np.nan, "", regex=True)
    for dataframe in [add_data, update_data, delete_data, total_data]:
        del dataframe["sam_extract"]

    table_name = ExecutiveCompensation.__table__.name
    insert_dataframe(add_data, table_name, sess.connection())
    for _, row in update_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            update(row, synchronize_session=False)
    for _, row in delete_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            delete(synchronize_session=False)
    sess.commit()
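# Sketch of the apply/join pattern above: expand a delimited string column into
# several derived columns with pd.Series and join them back onto the frame.
# split_pair and the '~' layout are invented stand-ins for parse_exec_comp, not
# the actual SAM format.
import pandas as pd

def split_pair(value):
    name, _, amount = (value or '').partition('~')
    return {'officer_name': name or None, 'officer_amount': amount or None}

frame = pd.DataFrame({'exec_comp_str': ['JANE DOE~100000', None]})
parsed = frame['exec_comp_str'].apply(lambda ecs: pd.Series(split_pair(ecs)))
frame = frame.drop(columns=['exec_comp_str']).join(parsed)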
def load_zip_city_data(force_reload):
    """ Load data into the ZipCity table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        citystate_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                         'Key': "ctystate.txt"}, ExpiresIn=600)
        zip_city_file = urllib.request.urlopen(citystate_file)
    else:
        citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
        zip_city_file = open(citystate_file)

    new_data = parse_zip_city_file(zip_city_file)

    diff_found = check_dataframe_diff(new_data, ZipCity, ['zip_city_id'], ['zip_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading zip_city table.')
        # delete any data in the ZipCity table
        sess.query(ZipCity).delete()

        # insert data into table
        num = insert_dataframe(new_data, ZipCity.__table__.name, sess.connection())
        logger.info('{} records inserted to zip_city'.format(num))
        sess.commit()
    else:
        logger.info('No differences found, skipping zip_city table reload.')
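# Sketch of the S3-or-local branch above: generate a short-lived presigned URL
# when the file lives in S3, otherwise fall back to a local path. resolve_source
# and its arguments are illustrative; the bucket, key and region are not the
# broker's actual configuration.
import os
import boto3

def resolve_source(use_aws, region, bucket, key, local_dir):
    if use_aws:
        s3 = boto3.client('s3', region_name=region)
        return s3.generate_presigned_url('get_object', {'Bucket': bucket, 'Key': key}, ExpiresIn=600)
    return os.path.join(local_dir, key)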
def load_state_data(force_reload):
    """ Load data into the States table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    state_file_url = '{}/state_list.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    with RetrieveFileFromUri(state_file_url, 'r').get_file_object() as state_file:
        new_data = parse_state_file(state_file)

    diff_found = check_dataframe_diff(new_data, States, ['states_id'], ['state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading states table.')
        # delete any data in the States table
        sess.query(States).delete()

        # insert data into table
        num = insert_dataframe(new_data, States.__table__.name, sess.connection())
        logger.info('{} records inserted to states'.format(num))
        sess.commit()
        update_external_data_load_date(start_time, datetime.now(), 'state_code')
    else:
        logger.info('No differences found, skipping states table reload.')
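# Generic sketch of the reload guard shared by these loaders: delete and
# reinsert only when a difference is detected or a reload is forced.
# reload_table and insert_fn are illustrative names, not broker helpers.
def reload_table(sess, model, new_data, diff_found, force_reload, insert_fn):
    if not (force_reload or diff_found):
        return 0
    sess.query(model).delete()
    num = insert_fn(new_data, model.__table__.name, sess.connection())
    sess.commit()
    return num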
def parse_county_file(county_file, sess):
    """ Parse the County file and insert all relevant rows into the database.

        Args:
            county_file: path/url to file to gather County data from
            sess: database session
    """
    # read the data and clean up the column names
    data = pd.read_csv(county_file, dtype=str, sep="|")
    data = clean_data(
        data,
        {"COUNTY_NUMERIC": "county_number",
         "COUNTY_NAME": "county_name",
         "STATE_ALPHA": "state_code"})

    # remove all blank county_number rows. Not much use in a county number table
    data = data[pd.notnull(data['county_number'])]

    # remove duplicates because we have no use for them (there may be none, this is a precaution)
    data = data[~data.duplicated(subset=['county_number', 'state_code'], keep='first')]

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, CountyCode.__table__.name, sess.connection())
    logger.info('{} records inserted to county_code'.format(num))
    sess.commit()
def load_county_data(county_file, force_reload):
    """ Load data into the CountyCode table

        Args:
            county_file: path/url to file to gather County data from
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    new_data = parse_county_file(county_file)

    diff_found = check_dataframe_diff(new_data, CountyCode, 'county_code_id',
                                      ['county_number', 'state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info(
            'Differences found or reload forced, reloading county_code table.')
        # delete any data in the CountyCode table
        sess.query(CountyCode).delete()

        # insert data into table
        num = insert_dataframe(new_data, CountyCode.__table__.name,
                               sess.connection())
        logger.info('{} records inserted to county_code'.format(num))
        sess.commit()
    else:
        logger.info('No differences found, skipping county_code table reload.')
def parse_county_file(county_file, sess):
    """ Parse the County file and insert all relevant rows into the database.

        Args:
            county_file: path/url to file to gather County data from
            sess: database session
    """
    # read the data and clean up the column names
    data = pd.read_csv(county_file, dtype=str, sep="|")
    data = clean_data(
        data, {
            "COUNTY_NUMERIC": "county_number",
            "COUNTY_NAME": "county_name",
            "STATE_ALPHA": "state_code"
        })

    # remove all blank county_number rows. Not much use in a county number table
    data = data[pd.notnull(data['county_number'])]

    # remove duplicates because we have no use for them (there may be none, this is a precaution)
    data = data[
        ~data.duplicated(subset=['county_number', 'state_code'], keep='first')]

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, CountyCode.__table__.name, sess.connection())
    logger.info('{} records inserted to county_code'.format(num))
    sess.commit()
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_country_codes.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_provided': 0,
        'duplicates_dropped': 0,
        'records_inserted': 0
    }
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "country_codes.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session

        # for object class, delete and replace values
        metrics_json['records_deleted'] = sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        metrics_json['records_provided'] = len(data.index)
        data = clean_data(data, CountryCode, {
            "country_code": "country_code",
            "country_name": "country_name"
        }, {})
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json[
            'records_provided'] - len(data.index)
        # flag territories or freely associated states
        data["territory_free_state"] = np.where(
            data["country_code"].isin(TERRITORIES_FREE_STATES), True, False)
        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        metrics_json['records_inserted'] = num
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_country_codes_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info("Script complete")
def load_zip_city_data(zip_city_file, force_reload):
    """ Load data into the ZipCity table

        Args:
            zip_city_file: path/url to file to gather ZipCity data from
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    new_data = parse_zip_city_file(zip_city_file)

    diff_found = check_dataframe_diff(new_data, ZipCity, 'zip_city_id',
                                      ['zip_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info(
            'Differences found or reload forced, reloading zip_city table.')
        # delete any data in the ZipCity table
        sess.query(ZipCity).delete()

        # insert data into table
        num = insert_dataframe(new_data, ZipCity.__table__.name,
                               sess.connection())
        logger.info('{} records inserted to zip_city'.format(num))
        sess.commit()
    else:
        logger.info('No differences found, skipping zip_city table reload.')
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "object_class.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code", "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def load_quarterly_threshold():
    """ Loads the quarterly revalidation threshold data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        threshold_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                         'Key': "quarterly_submission_dates.csv"},
                                                          ExpiresIn=600)
    else:
        threshold_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                      "quarterly_submission_dates.csv")

    logger.info('Loading quarterly revalidation threshold data')
    with create_app().app_context():
        data = pd.read_csv(threshold_file, dtype=str)

        data = clean_data(
            data,
            QuarterlyRevalidationThreshold,
            {"year": "year", "quarter": "quarter", "window_start": "window_start", "window_end": "window_end"},
            {}
        )

        sess = GlobalDB.db().session
        # delete any data in the QuarterlyRevalidationThreshold table
        sess.query(QuarterlyRevalidationThreshold).delete()

        # insert data into table
        num = insert_dataframe(data, QuarterlyRevalidationThreshold.__table__.name, sess.connection())
        logger.info('{} records inserted to quarterly_revalidation_threshold'.format(num))
        sess.commit()
def load_sf133(sess, filename, fiscal_year, fiscal_period, force_sf133_load=False, metrics=None):
    """ Load SF 133 (budget execution report) lookup table.

        Args:
            sess: connection to database
            filename: name/path of the file to read in
            fiscal_year: fiscal year of the file being loaded
            fiscal_period: fiscal period of the file being loaded
            force_sf133_load: boolean to indicate whether to force a reload of the data
            metrics: an object containing information for the metrics file
    """
    if not metrics:
        metrics = {'records_deleted': 0, 'records_inserted': 0}

    existing_records = sess.query(SF133).filter(SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
    if force_sf133_load:
        # force a reload of this period's current data
        logger.info('Force SF 133 load: deleting existing records for %s %s', fiscal_year, fiscal_period)
        delete_count = existing_records.delete()
        logger.info('%s records deleted', delete_count)
        metrics['records_deleted'] += delete_count
    elif existing_records.count():
        # if there's existing data & we're not forcing a load, skip
        logger.info('SF133 %s %s already in database (%s records). Skipping file.', fiscal_year, fiscal_period,
                    existing_records.count())
        return

    data = clean_sf133_data(filename, SF133)

    # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones we don't actually
    # use in the validations. Arguably, it would be better just to include everything, but that drastically
    # increases the number of records we're inserting to the sf_133 table. If we ever decide that we need *all*
    # SF 133 lines that are zero value, remove the next two lines.
    sf_133_validation_lines = [
        '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022', '1023', '1024', '1025', '1026', '1029',
        '1030', '1031', '1032', '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280', '1340', '1440',
        '1540', '1640', '1750', '1850', '1910', '2190', '2490', '2500', '3020', '4801', '4802', '4881', '4882',
        '4901', '4902', '4908', '4981', '4982'
    ]
    data = data[(data.line.isin(sf_133_validation_lines)) | (data.amount != 0)]

    # we didn't use the 'keep_null' option when padding allocation transfer agency, because nulls in that column
    # break the pivot (see above comments). so, replace the ata '000' with an empty value before inserting to db
    data['allocation_transfer_agency'] = data['allocation_transfer_agency'].str.replace('000', '')
    # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented as
    # NULL in the db.
    data = data.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

    # Keeping display_tas out here as it depends on empty allocation_transfer_agency being None and not 000
    data['display_tas'] = data.apply(lambda row: concat_display_tas_dict(row), axis=1)

    # insert to db
    table_name = SF133.__table__.name
    num = insert_dataframe(data, table_name, sess.connection())
    metrics['records_inserted'] += num
    update_account_num(int(fiscal_year), int(fiscal_period))
    sess.commit()

    logger.info('%s records inserted to %s', num, table_name)
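# Minimal sketch of the blank-to-NULL pass above: strip every cell and turn
# empty strings into None so they land as NULL in the database. The frame
# contents are made up.
import pandas as pd

frame = pd.DataFrame({'line': ['1000', ' '], 'amount': ['5', '']})
frame = frame.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)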
def insert_file(filename, submission_id, file_type_id, csv_schema, long_to_short_dict):
    """ Insert the data from the file into the corresponding Certified table.

        Args:
            filename: filename to load
            submission_id: Database ID for the submission being loaded
            file_type_id: Database file type ID for files A, B, or C
            csv_schema: Schema built for this file type's columns
            long_to_short_dict: Dict to translate long column names to the column names used by the database
    """
    sess = GlobalDB.db().session
    logger.info('Copying "{}" into {} table'.format(filename, FTI_TABLENAME_DICT[file_type_id]))

    # If this is a file in S3, download to a local temp file first then use temp file as local file
    if CONFIG_BROKER['use_aws']:
        (file, tmp_filename) = tempfile.mkstemp()
        s3 = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        s3.download_file(CONFIG_BROKER['certified_bucket'], filename, tmp_filename)
        filename = tmp_filename

    with open(filename) as file:
        # Get file delimiter and reset reader to start of file
        delim = '|' if file.readline().count('|') != 0 else ','
        file.seek(0)

        # Create dataframe from file
        data = pd.read_csv(file, dtype=str, delimiter=delim)

    # Only use the columns needed for the DB table
    data = data.rename(columns=lambda x: x.lower().strip())
    data = data.rename(index=str, columns=long_to_short_dict)
    data = data[list(csv_schema.keys())]

    # Clean rows
    if len(data.index) > 0:
        for col in long_to_short_dict.values():
            data[col] = data.apply(lambda x: clean_col(x, col, file_type_id, csv_schema), axis=1)

    # Populate columns that aren't in the file
    if len(data.index) > 0:
        data['tas'] = data.apply(lambda x: format_internal_tas(x), axis=1)
    now = datetime.datetime.now()
    data['created_at'] = now
    data['updated_at'] = now
    data['submission_id'] = submission_id
    job = sess.query(Job).filter_by(submission_id=submission_id, file_type_id=file_type_id,
                                    job_type_id=JOB_TYPE_DICT['csv_record_validation']).one()
    data['job_id'] = job.job_id
    data = data.reset_index()
    data['row_number'] = data.index + 2
    data = data.drop(['index'], axis=1)

    # Load dataframe into the DB table
    count = insert_dataframe(data, FTI_TABLE_DICT[file_type_id].__table__.name, sess.connection())
    sess.commit()

    logger.info('Loaded {} records into the {} table'.format(count, FTI_TABLENAME_DICT[file_type_id]))
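# Sketch of the delimiter check above: peek at the first line to decide between
# pipe- and comma-delimited input, then rewind before handing the file to
# pandas. read_broker_csv and its path argument are illustrative names only.
import pandas as pd

def read_broker_csv(path):
    with open(path) as handle:
        delim = '|' if handle.readline().count('|') != 0 else ','
        handle.seek(0)
        return pd.read_csv(handle, dtype=str, delimiter=delim)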
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "object_class.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data, ObjectClass, {
                "max_oc_code": "object_class_code",
                "max_object_class_name": "object_class_name"
            }, {"object_class_code": {
                "pad_to_length": 3
            }})
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json[
            'records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def load_sf133(filename, fiscal_year, fiscal_period, force_sf133_load=False):
    """Load SF 133 (budget execution report) lookup table."""

    with create_app().app_context():
        sess = GlobalDB.db().session

        existing_records = sess.query(SF133).filter(
            SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
        if force_sf133_load:
            # force a reload of this period's current data
            logger.info(
                'Force SF 133 load: deleting existing records for %s %s',
                fiscal_year, fiscal_period)
            delete_count = existing_records.delete()
            logger.info('%s records deleted', delete_count)
        elif existing_records.count():
            # if there's existing data & we're not forcing a load, skip
            logger.info(
                'SF133 %s %s already in database (%s records). Skipping file.',
                fiscal_year, fiscal_period, existing_records.count())
            return

        data = clean_sf133_data(filename, SF133)

        # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones
        # we don't actually use in the validations. Arguably, it would be better just to include
        # everything, but that drastically increases the number of records we're inserting to the
        # sf_133 table. If we ever decide that we need *all* SF 133 lines that are zero value,
        # remove the next two lines.
        sf_133_validation_lines = [
            '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022',
            '1023', '1024', '1025', '1026', '1029', '1030', '1031', '1032',
            '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280',
            '1340', '1440', '1540', '1640', '1750', '1850', '1910', '2190',
            '2490', '2500', '3020', '4801', '4802', '4881', '4882', '4901',
            '4902', '4908', '4981', '4982'
        ]
        data = data[(data.line.isin(sf_133_validation_lines)) |
                    (data.amount != 0)]

        # we didn't use the 'keep_null' option when padding allocation transfer agency,
        # because nulls in that column break the pivot (see above comments).
        # so, replace the ata '000' with an empty value before inserting to db
        data['allocation_transfer_agency'] = data[
            'allocation_transfer_agency'].str.replace('000', '')
        # make a pass through the dataframe, changing any empty values to None, to ensure
        # that those are represented as NULL in the db.
        data = data.applymap(lambda x: str(x).strip()
                             if len(str(x).strip()) else None)

        # insert to db
        table_name = SF133.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        update_tas_id(int(fiscal_year), int(fiscal_period))
        sess.commit()

    logger.info('%s records inserted to %s', num, table_name)
    def load_sql(cls, filename):
        """ Load SQL-based validation rules to db. """
        with create_app().app_context():
            sess = GlobalDB.db().session
            filename = os.path.join(cls.sql_rules_path, filename)

            # Initial load
            sql_data = pd.read_csv(filename, dtype=str, usecols=cls.headers)
            sql_data = clean_data(
                sql_data,
                RuleSql,
                {'rule_label': 'rule_label', 'rule_error_message': 'rule_error_message', 'query_name': 'query_name',
                 'expected_value': 'expected_value', 'category': 'category', 'file_type': 'file_type',
                 'target_file': 'target_file', 'rule_cross_file_flag': 'rule_cross_file_flag',
                 'severity_name': 'severity_name'},
                {}
            )

            # Processing certain values
            sql_data['rule_sql'] = sql_data['query_name'].apply(lambda name: cls.read_sql_str(name))
            sql_data['file_id'] = sql_data['file_type'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
            if sql_data['file_id'].isnull().values.any():
                raise Exception('Invalid file_type value found in sqlLoader. Must be one of the following: {}'
                                .format(', '.join(list(FILE_TYPE_DICT.keys()))))
            sql_data['target_file_id'] = sql_data['target_file'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
            sql_data['rule_cross_file_flag'] = sql_data['rule_cross_file_flag'].apply(lambda flag:
                                                                                      flag in ('true', 't', 'y', 'yes'))
            sql_data['rule_severity_id'] = sql_data['severity_name'].apply(lambda severity_name:
                                                                           RULE_SEVERITY_DICT.get(severity_name, None))
            if sql_data['rule_severity_id'].isnull().values.any():
                raise Exception('Invalid severity_name value found in sqlLoader. Must be one of the following: {}'
                                .format(', '.join(list(RULE_SEVERITY_DICT.keys()))))
            sql_data.drop(['file_type', 'severity_name', 'target_file'], axis=1, inplace=True)

            # Final check if we need to actually reload
            if check_dataframe_diff(sql_data, RuleSql, del_cols=['rule_sql_id', 'created_at', 'updated_at'],
                                    sort_cols=['rule_label', 'file_id', 'target_file_id']):
                # Delete and reload all records currently in table
                logger.info('Detected changes in {}, deleting RuleSQL and reloading'.format(cls.sql_rules_path))
                sess.query(RuleSql).delete()
                insert_dataframe(sql_data, RuleSql.__table__.name, sess.connection())
                sess.commit()
            else:
                logger.info('No changes detected since last load. Skipping.')
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    filename = os.path.join(base_path, 'object_class.csv')
    try:
        # Update file from public S3 bucket
        object_class_url = '{}/object_class.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(object_class_url, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    except Exception:
        pass

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code", "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    update_external_data_load_date(now, datetime.datetime.now(), 'object_class')

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info(
        "Adding congressional districts from census to the state_congressional table"
    )

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data, model, {
            "state_code": "state_code",
            "congressional_district_no": "congressional_district_no",
            "census_year": "census_year"
        }, {'congressional_district_no': {
            "pad_to_length": 2
        }})

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
def create_temp_duns_table(sess, table_name, data):
    """ Creates a temporary duns table with the given name and data.

        Args:
            sess: database connection
            table_name: what to name the table being created
            data: pandas dataframe representing duns data
    """
    logger.info('Making {} table'.format(table_name))
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS {} (
            created_at TIMESTAMP WITHOUT TIME ZONE,
            updated_at TIMESTAMP WITHOUT TIME ZONE,
            awardee_or_recipient_uniqu TEXT,
            activation_date DATE,
            expiration_date DATE,
            deactivation_date DATE,
            registration_date DATE,
            last_sam_mod_date DATE,
            legal_business_name TEXT,
            dba_name TEXT,
            ultimate_parent_unique_ide TEXT,
            ultimate_parent_legal_enti TEXT,
            address_line_1 TEXT,
            address_line_2 TEXT,
            city TEXT,
            state TEXT,
            zip TEXT,
            zip4 TEXT,
            country_code TEXT,
            congressional_district TEXT,
            business_types_codes TEXT[],
            business_types TEXT[],
            entity_structure TEXT
        );
    """.format(table_name)
    sess.execute(create_table_sql)
    # Truncating in case we didn't clear out this table after a failure in the script
    sess.execute('TRUNCATE TABLE {};'.format(table_name))
    insert_dataframe(data, table_name, sess.connection())
def parse_city_file(city_file, sess):
    """ Parse the City file and insert all relevant rows into the database.

        Args:
            city_file: path/url to file to gather City data from
            sess: database session
    """
    # read the data and clean up the column names
    data = pd.read_csv(city_file, dtype=str, sep="|")
    data = clean_data(
        data, {
            "FEATURE_NAME": "feature_name",
            "FEATURE_CLASS": "feature_class",
            "CENSUS_CODE": "city_code",
            "STATE_ALPHA": "state_code",
            "COUNTY_NUMERIC": "county_number",
            "COUNTY_NAME": "county_name",
            "PRIMARY_LATITUDE": "latitude",
            "PRIMARY_LONGITUDE": "longitude"
        })

    # add a sort column based on feature_class and remove anything with a different feature class or empty city_code
    feature_class_ranking = {
        "Populated Place": 1,
        "Locale": 2,
        "Civil": 3,
        "Census": 4
    }
    data = data[pd.notnull(data['city_code'])]
    data['sorting_col'] = data['feature_class'].map(feature_class_ranking)
    data = data[pd.notnull(data['sorting_col'])]

    # sort by feature_class then remove any duplicates within state/city code combo (we keep the first occurrence
    # because we've sorted by priority so the one that would overwrite the others is on top already)
    data = data.sort_values(by=['sorting_col'])
    data = data[~data.
                duplicated(subset=['state_code', 'city_code'], keep='first')]
    data = data.drop('sorting_col', axis=1)

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # just sorting it how it started out
    data = data.sort_values(by=['feature_name'])

    # insert data into table
    num = insert_dataframe(data, CityCode.__table__.name, sess.connection())
    logger.info('{} records inserted to city_code'.format(num))
    sess.commit()
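# Sketch of the rank/sort/deduplicate step above with invented sample data: map
# each feature class to a priority, sort on it, and keep the first row per
# state/city combination so the highest-priority class wins.
import pandas as pd

frame = pd.DataFrame({'state_code': ['VA', 'VA'], 'city_code': ['01000', '01000'],
                      'feature_class': ['Civil', 'Populated Place']})
ranking = {'Populated Place': 1, 'Locale': 2, 'Civil': 3, 'Census': 4}
frame['sorting_col'] = frame['feature_class'].map(ranking)
frame = frame.sort_values(by=['sorting_col'])
frame = frame[~frame.duplicated(subset=['state_code', 'city_code'], keep='first')]
frame = frame.drop('sorting_col', axis=1)  # only the 'Populated Place' row remains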
def parse_fabs_file(f, sess, fips_state_list, state_code_list, sub_tier_list,
                    county_code_list):
    """ Parse a FABS datafeed file and insert the formatted rows into the database.

        Args:
            f: the zip file of the datafeed to parse
            sess: the database connection
            fips_state_list: FIPS/state reference data used to format the rows
            state_code_list: state code reference data used to format the rows
            sub_tier_list: sub tier agency reference data used to format the rows
            county_code_list: county code reference data used to format the rows
    """
    logger.info("starting file " + str(f.name))

    csv_file = 'datafeeds\\' + os.path.splitext(os.path.basename(f.name))[0]
    zfile = zipfile.ZipFile(f.name)
    data = pd.read_csv(
        zfile.open(csv_file),
        dtype=str,
        usecols=[
            'cfda_program_num', 'sai_number', 'recipient_name',
            'recipient_city_code', 'recipient_city_name',
            'recipient_county_code', 'recipient_county_name', 'recipient_zip',
            'recipient_type', 'action_type', 'agency_code', 'federal_award_id',
            'federal_award_mod', 'fed_funding_amount',
            'non_fed_funding_amount', 'total_funding_amount',
            'obligation_action_date', 'starting_date', 'ending_date',
            'assistance_type', 'record_type', 'correction_late_ind',
            'fyq_correction', 'principal_place_code', 'principal_place_state',
            'principal_place_cc', 'principal_place_country_code',
            'principal_place_zip', 'principal_place_cd', 'cfda_program_title',
            'project_description', 'duns_no', 'receip_addr1', 'receip_addr2',
            'receip_addr3', 'face_loan_guran', 'orig_sub_guran',
            'recipient_cd', 'rec_flag', 'recipient_country_code', 'uri',
            'recipient_state_code', 'last_modified_date'
        ])

    clean_data = format_fabs_data(data, sess, fips_state_list, state_code_list,
                                  sub_tier_list, county_code_list)

    if clean_data is not None:
        logger.info("loading {} rows".format(len(clean_data.index)))

        insert_dataframe(clean_data,
                         PublishedAwardFinancialAssistance.__table__.name,
                         sess.connection())
        sess.commit()
def parse_fabs_file(f, sess, fips_state_list, state_code_list, sub_tier_list, county_code_list):
    """ Parse a FABS datafeed file and insert the formatted rows into the database.

        Args:
            f: the zip file of the datafeed to parse
            sess: the database connection
            fips_state_list: FIPS/state reference data used to format the rows
            state_code_list: state code reference data used to format the rows
            sub_tier_list: sub tier agency reference data used to format the rows
            county_code_list: county code reference data used to format the rows
    """
    logger.info("starting file " + str(f.name))

    csv_file = 'datafeeds\\' + os.path.splitext(os.path.basename(f.name))[0]
    zfile = zipfile.ZipFile(f.name)
    data = pd.read_csv(zfile.open(csv_file), dtype=str, usecols=[
        'cfda_program_num', 'sai_number', 'recipient_name', 'recipient_city_code', 'recipient_city_name',
        'recipient_county_code', 'recipient_county_name', 'recipient_zip', 'recipient_type', 'action_type',
        'agency_code', 'federal_award_id', 'federal_award_mod', 'fed_funding_amount', 'non_fed_funding_amount',
        'total_funding_amount', 'obligation_action_date', 'starting_date', 'ending_date', 'assistance_type',
        'record_type', 'correction_late_ind', 'fyq_correction', 'principal_place_code', 'principal_place_state',
        'principal_place_cc', 'principal_place_country_code', 'principal_place_zip', 'principal_place_cd',
        'cfda_program_title', 'project_description', 'duns_no', 'receip_addr1', 'receip_addr2', 'receip_addr3',
        'face_loan_guran', 'orig_sub_guran', 'recipient_cd', 'rec_flag', 'recipient_country_code', 'uri',
        'recipient_state_code', 'last_modified_date'
    ])

    clean_data = format_fabs_data(data, sess, fips_state_list, state_code_list, sub_tier_list, county_code_list)

    if clean_data is not None:
        logger.info("loading {} rows".format(len(clean_data.index)))

        insert_dataframe(clean_data, PublishedAwardFinancialAssistance.__table__.name, sess.connection())
        sess.commit()
def update_state_congr_table_census(census_file, sess):
    """ Update contents of state_congressional table to include districts from the census

        Args:
            census_file: file path/url to the census file to read
            sess: the database connection
    """
    logger.info("Adding congressional districts from census to the state_congressional table")

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data,
        model,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
def load_defc(force_reload=False):
    """ Loads the DEFC data.

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    defc_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator',
                             'config', 'def_codes.csv')

    try:
        # Update file from public S3 bucket
        def_codes_url = '{}/def_codes.csv'.format(
            CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(def_codes_url, allow_redirects=True)
        open(defc_file, 'wb').write(r.content)
    except Exception:
        pass

    logger.info('Loading defc data')
    with create_app().app_context():
        data = pd.read_csv(defc_file, dtype=str)

        # Remove all invalid DEFCs that have been left in the file so USAS can continue to display them correctly
        data = data[data['Is Valid'] == 'true']

        data = clean_data(data, DEFC, {
            'defc': 'code',
            'group_name': 'group'
        }, {})

        diff_found = check_dataframe_diff(data, DEFC, ['defc_id'], ['code'])

        if force_reload or diff_found:
            sess = GlobalDB.db().session
            # delete any data in the DEFC table
            sess.query(DEFC).delete()

            # insert data into table
            num = insert_dataframe(data, DEFC.__table__.name,
                                   sess.connection())
            logger.info('{} records inserted to defc'.format(num))
            sess.commit()
            update_external_data_load_date(start_time, datetime.now(), 'defc')
        else:
            logger.info('No differences found, skipping defc table reload.')
def load_submission_window_schedule():
    """ Loads the submission window schedule data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        sub_schedule_file = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "submission_window_schedule.csv"
            },
            ExpiresIn=600)
    else:
        sub_schedule_file = os.path.join(CONFIG_BROKER['path'],
                                         'dataactvalidator', 'config',
                                         'submission_window_schedule.csv')

    logger.info('Loading submission window schedule data')
    with create_app().app_context():
        data = pd.read_csv(sub_schedule_file, dtype=str)

        data = clean_data(
            data, SubmissionWindowSchedule, {
                'year': 'year',
                'period': 'period',
                'period_start': 'period_start',
                'publish_deadline': 'publish_deadline',
                'certification_deadline': 'certification_deadline'
            }, {})

        # Add a day to the deadlines because the dates in the file are supposed to be inclusive
        data['publish_deadline'] = data.apply(
            lambda x: add_day(x, 'publish_deadline'), axis=1)
        data['certification_deadline'] = data.apply(
            lambda x: add_day(x, 'certification_deadline'), axis=1)

        sess = GlobalDB.db().session
        # delete any data in the SubmissionWindowSchedule table
        sess.query(SubmissionWindowSchedule).delete()

        # insert data into table
        num = insert_dataframe(data, SubmissionWindowSchedule.__table__.name,
                               sess.connection())
        logger.info(
            '{} records inserted to submission_window_schedule'.format(num))
        sess.commit()
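# The dates in the schedule file are inclusive, so the loader shifts each
# deadline forward by one day before storing it. This add_day is a simplified
# stand-in for the helper used above, not its actual implementation.
import pandas as pd

def add_day(row, col):
    value = pd.to_datetime(row[col], errors='coerce')
    return value + pd.Timedelta(days=1) if pd.notnull(value) else None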
def parse_city_file(city_file, sess):
    """ Parse the City file and insert all relevant rows into the database.

        Args:
            city_file: path/url to file to gather City data from
            sess: database session
    """
    # read the data and clean up the column names
    data = pd.read_csv(city_file, dtype=str, sep="|")
    data = clean_data(
        data,
        {"FEATURE_NAME": "feature_name",
         "FEATURE_CLASS": "feature_class",
         "CENSUS_CODE": "city_code",
         "STATE_ALPHA": "state_code",
         "COUNTY_NUMERIC": "county_number",
         "COUNTY_NAME": "county_name",
         "PRIMARY_LATITUDE": "latitude",
         "PRIMARY_LONGITUDE": "longitude"})

    # add a sort column based on feature_class and remove anything with a different feature class or empty city_code
    feature_class_ranking = {"Populated Place": 1, "Locale": 2, "Civil": 3, "Census": 4}
    data = data[pd.notnull(data['city_code'])]
    data['sorting_col'] = data['feature_class'].map(feature_class_ranking)
    data = data[pd.notnull(data['sorting_col'])]

    # sort by feature_class then remove any duplicates within state/city code combo (we keep the first occurrence
    # because we've sorted by priority so the one that would overwrite the others is on top already)
    data = data.sort_values(by=['sorting_col'])
    data = data[~data.duplicated(subset=['state_code', 'city_code'], keep='first')]
    data = data.drop('sorting_col', axis=1)

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # just sorting it how it started out
    data = data.sort_values(by=['feature_name'])

    # insert data into table
    num = insert_dataframe(data, CityCode.__table__.name, sess.connection())
    logger.info('{} records inserted to city_code'.format(num))
    sess.commit()
def parse_state_file(state_file, sess):
    """ Parse the State file and insert all relevant rows into the database.

        Args:
            state_file: path/url to file to gather State data from
            sess: database session
    """
    # read the data. Cleaning is in there in case something changes, doesn't really do anything now
    data = pd.read_csv(state_file, dtype=str)
    data = clean_data(
        data,
        {"state_name": "state_name",
         "state_code": "state_code",
         "fips_code": "fips_code"})

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, States.__table__.name, sess.connection())
    logger.info('{} records inserted to states'.format(num))
    sess.commit()
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """

    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "country_codes.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for object class, delete and replace values
        sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            CountryCode,
            {"country_code": "country_code", "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def parse_state_file(state_file, sess):
    """ Parse the State file and insert all relevant rows into the database.

        Args:
            state_file: path/url to file to gather State data from
            sess: database session
    """
    # read the data. Cleaning is in there in case something changes, doesn't really do anything now
    data = pd.read_csv(state_file, dtype=str)
    data = clean_data(
        data, {
            "state_name": "state_name",
            "state_code": "state_code",
            "fips_code": "fips_code"
        })

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, States.__table__.name, sess.connection())
    logger.info('{} records inserted to states'.format(num))
    sess.commit()
def process_file_chunk(sess, data, certified_table, job, submission_id,
                       file_type_id, rename_cols, col_mapping, all_cols,
                       row_offset, float_cols):
    """ Load in a chunk of award data from updated submissions

        Args:
            sess: the database connection
            data: the chunked dataframe
            certified_table: the certified table to copy to
            job: the certified validation job associated with the file type
            submission_id: the submission associated with the file
            file_type_id: the file type id associated with the file
            rename_cols: mapping of columns that have been renamed over time
            col_mapping: mapping of either DAIMS name or long name to the short names
            all_cols: all the schema columns and deleted columns over time
            row_offset: with the chunking, indicates the row starting point in the file
            float_cols: columns that are floats (to remove the commas)

        Returns:
            updated row_offset to be reused
    """

    # Only use the columns needed for the DB table
    if data.empty:
        logger.info('Empty file for submission {}, {} file. Skipping'.format(
            submission_id, FILE_TYPE_DICT_ID[file_type_id]))
        return

    # Renaming columns to short db names regardless of how old the files are
    data = data.rename(columns=lambda x: x.lower().strip())
    data = data.rename(index=str, columns=rename_cols)
    data = data.rename(index=str, columns=col_mapping)
    # If the file is missing new columns added over time, just set them to None
    blank_cols = list(set(all_cols) - set(list(data.columns)))
    logger.info('The following fields were not found in this chunk: {}'.format(
        blank_cols))
    data = data.reindex(columns=list(data.columns) + blank_cols)
    # Keep only what we need from the schema + any deleted columns
    data = data[[col for col in all_cols if col in data.columns]]

    # Clean rows
    if len(data.index) > 0:
        data = data.applymap(clean_col)
        for field in [col for col in list(data.columns) if col in float_cols]:
            data[field] = data[field].apply(lambda x: x.replace(',', '')
                                            if x else None)

    # Populate columns that aren't in the file
    now = datetime.datetime.now()
    data['created_at'] = now
    data['updated_at'] = now
    data['submission_id'] = submission_id
    data['job_id'] = job.job_id

    data = data.reset_index()
    original_row_offset = row_offset
    data['row_number'] = row_offset + data.index + 2
    row_offset += CHUNK_SIZE

    data = data.drop(['index'], axis=1)

    logger.info('Moving chunk data for submission {}, {} file, starting from row {}'.format(
        submission_id, FILE_TYPE_DICT_ID[file_type_id], original_row_offset + 2))

    # Process and insert the data
    insert_dataframe(data, certified_table.__table__.name, sess.connection())
    sess.commit()

    return row_offset
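
The row_number arithmetic above is easy to misread: rows are numbered against the original submission file, where line 1 is the header, so the first data row of the first chunk gets row_number 2 and each later chunk starts CHUNK_SIZE rows further down. A minimal sketch of just that bookkeeping, using a hypothetical CHUNK_SIZE of 3 (a stand-in for the loader's real constant):

import pandas as pd

CHUNK_SIZE = 3  # hypothetical; stands in for the loader's real chunk size constant

def number_chunk(chunk, row_offset):
    """Reproduce the row_number bookkeeping from process_file_chunk above."""
    chunk = chunk.reset_index(drop=True)
    # +2 accounts for the header line (row 1) and 1-based row numbering
    chunk['row_number'] = row_offset + chunk.index + 2
    return chunk, row_offset + CHUNK_SIZE

offset = 0
first, offset = number_chunk(pd.DataFrame({'val': ['a', 'b', 'c']}), offset)
second, offset = number_chunk(pd.DataFrame({'val': ['d', 'e']}), offset)
print(list(first['row_number']))   # [2, 3, 4]
print(list(second['row_number']))  # [5, 6]
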
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False, table=DUNS, year=None):
    """ Takes in a SAM file and adds the DUNS data to the database

        Args:
            file_path: the path to the SAM file
            sess: the database connection
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            table: the table to work from (could be DUNS/HistoricParentDuns)
            year: the year associated with the data (primarily for HistoricParentDuns loads)
    """
    parse_start_time = time.time()
    logger.info("Starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "registration_date": 6,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10,
            "dba_name": 11,
            "address_line_1": 14,
            "address_line_2": 15,
            "city": 16,
            "state": 17,
            "zip": 18,
            "zip4": 19,
            "country_code": 20,
            "congressional_district": 21,
            "entity_structure": 27,
            "business_types_raw": 31,
            "ultimate_parent_legal_enti": 186,
            "ultimate_parent_unique_ide": 187
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = (nrows-1)//block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = ((nrows % block_size) or block_size)-skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch*block_size)
            nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size
            logger.info('Loading rows %s to %s', skiprows+1, nrows+skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys(), quoting=3)

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # convert business types string to array
                    bt_func = (lambda bt_raw: pd.Series([[str(code) for code in str(bt_raw).split('~')
                                                          if isinstance(bt_raw, str)]]))
                    csv_data = csv_data.assign(business_types_codes=csv_data["business_types_raw"].apply(bt_func))
                    del csv_data["business_types_raw"]
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None), table=table)

                    if monthly:
                        logger.info("Adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        if year:
                            csv_data['year'] = year
                        insert_dataframe(csv_data, table.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("Attempting to bulk load add data")
                                insert_dataframe(add_data, table.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("Bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, sess, benchmarks=benchmarks)
                                logger.info("Loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks,
                                                 table=table)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, sess,
                                                                           benchmarks=benchmarks)
                            logger.info("Loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks,
                                             table=table)
                    sess.commit()

            added_rows += nrows
            batch += 1
            logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time,
                                                                         added_rows))
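
The batch arithmetic in parse_sam_file is compact enough to be confusing: the header line is only skipped on the first read, and the footer is trimmed from the final batch through last_block_size (or both at once when the file fits in a single batch). A standalone sketch that reproduces the same arithmetic and just prints the (skiprows, nrows) pairs the loop would hand to pd.read_csv, with a toy block size:

def plan_batches(total_lines, block_size=10000):
    """Reproduce the (skiprows, nrows) schedule parse_sam_file uses when chunking the .dat file."""
    batches = (total_lines - 1) // block_size
    # when there is only one batch, drop the header and footer from its row count in one go
    skiplastrows = 2 if batches == 0 else 1
    last_block_size = ((total_lines % block_size) or block_size) - skiplastrows
    plan = []
    for batch in range(batches + 1):
        skiprows = 1 if batch == 0 else batch * block_size
        nrows = ((batch + 1) * block_size - skiprows) if batch < batches else last_block_size
        plan.append((skiprows, nrows))
    return plan

# 25 lines total: 1 header + 23 data rows + 1 footer, read in blocks of 10
print(plan_batches(25, block_size=10))  # [(1, 9), (10, 10), (20, 4)] -> 9 + 10 + 4 = 23 data rows
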
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        with open(filename, 'wb') as cfda_file:
            cfda_file.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    """Load country code lookup table."""
    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        now = datetime.utcnow()
        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates. We'll set it universally to 1
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)

        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the audit dates in the dataframe pulled from the db, and
        # also set all the pks to 1, so they match the import dataframe
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)
        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)

        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]

        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]

        # need to reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)
        # My favorite part: when pandas pulls the data out of postgres, the program_number column
        # is a Decimal. However, in adding it to the dataframe, this column loses precision.
        # So for example, a program number of 10.001 imports into the dataframe as 10.000999999999999.
        # It also needs to be cast to a string and padded with the right number of zeroes, as needed.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))
        # Finally, you can execute this and get True back if the data truly has not changed from the last
        # time the CSV was loaded.
        new_data = not import_dataframe.equals(current_data)
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
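
The fix_program_number helper above exists because the Decimal program numbers pulled back out of postgres lose precision in the dataframe, and because a float like 10.1 prints shorter than the CSV's '10.100'. Running the same function on both failure modes shows the round trip it is repairing:

import math

def fix_program_number(n, decimals=3):
    multiplier = 10 ** decimals
    value = math.floor(n * multiplier + 0.5) / multiplier
    return str(value).ljust(6, '0')

print(fix_program_number(10.000999999999999))  # '10.001' -- precision restored
print(fix_program_number(10.1))                # '10.100' -- padded to match the CSV text
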
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return
            headers = set([header.upper() for header in list(raw_data)])

            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id', 'allocation_id': 'allocation_transfer_id',
                     'account_code': 'account_number', 'pa_code': 'program_activity_code',
                     'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # Because we're only loading a subset of program activity info, there will be duplicate
            # records in the dataframe. This is ok, but they need to be de-duped before the db load,
            # and we also need to log how many were dropped.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

            if export:
                export_public_pa(raw_data)

        end_time = datetime.datetime.now()
        update_external_data_load_date(now, end_time, 'program_activity')
        update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['records_inserted'] = num

        metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        exit_if_nonlocal(3)
        return
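
The cleaning rules handed to clean_data above ('pad_to_length': 4, 'keep_null': True, and so on) presumably zero-pad the code fields so that '98' and '0098' collapse to the same key while optional codes can stay null. The clean_data helper itself is not part of these snippets, so the sketch below is only an assumption about what such a rule might do, not the broker's implementation:

# Hypothetical stand-in for a 'pad_to_length' cleaning rule; the real clean_data
# helper is not shown in these snippets.
def pad_to_length(value, length, keep_null=False):
    if value is None or str(value).strip() == '':
        # keep_null lets optional codes (e.g. allocation_transfer_id) stay empty
        return None if keep_null else ''.zfill(length)
    return str(value).strip().zfill(length)

print(pad_to_length('98', 4))                # '0098'
print(pad_to_length('', 3, keep_null=True))  # None
print(pad_to_length('020', 3))               # '020'
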
Exemplo n.º 37
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        with open(filename, 'wb') as cfda_file:
            cfda_file.write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
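
check_dataframe_diff itself is not included in these snippets, but the call above mirrors what the earlier load_cfda_program does by hand: ignore the generated pk column, normalize program_number with a per-row lambda, align ordering, and only reload when the frames actually differ. A rough, hedged sketch of that comparison idea (the function and argument names here are assumptions, not the broker's API):

import pandas as pd

def frames_differ(new_frame, db_frame, ignore_cols, lambda_funcs):
    """Assumed behavior: drop ignored columns, apply per-row normalizers to the
       db-side frame, align column and row order, then compare. Both frames are
       expected to share the same columns after the drop."""
    new_frame = new_frame.drop(columns=ignore_cols, errors='ignore')
    db_frame = db_frame.drop(columns=ignore_cols, errors='ignore')
    for column, func in lambda_funcs:
        db_frame[column] = db_frame.apply(func, axis=1)
    sort_cols = sorted(new_frame.columns)
    new_frame = new_frame[sort_cols].sort_values(sort_cols).reset_index(drop=True)
    db_frame = db_frame[sort_cols].sort_values(sort_cols).reset_index(drop=True)
    return not new_frame.equals(db_frame)

# e.g. frames_differ(import_data, current_data,
#                    ['cfda_program_id', 'created_at', 'updated_at'],
#                    [('program_number', fix_program_number)])
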
def parse_sam_file(file, sess):
    """ Parse a SAM extract file and load the executive compensation data it contains.

        Args:
            file: the zipped SAM extract file object to parse
            sess: the database connection
    """
    logger.info("starting file " + str(file.name))

    csv_file = os.path.splitext(os.path.basename(file.name))[0] + '.dat'
    zfile = zipfile.ZipFile(file.name)

    # can't use skipfooter, pandas' c engine doesn't work with skipfooter and the python engine doesn't work with dtype
    nrows = 0
    with zfile.open(csv_file) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer
    column_header_mapping = {
        "awardee_or_recipient_uniqu": 0,
        "sam_extract": 4,
        "expiration_date": 7,
        "activation_date": 9,
        "ultimate_parent_legal_enti": 10,
        "ultimate_parent_unique_ide": 48,
        "exec_comp_str": 89
    }
    column_header_mapping_ordered = OrderedDict(
        sorted(column_header_mapping.items(), key=lambda c: c[1]))
    csv_data = pd.read_csv(zfile.open(csv_file),
                           dtype=str,
                           header=None,
                           skiprows=1,
                           nrows=nrows,
                           sep='|',
                           usecols=column_header_mapping_ordered.values(),
                           names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()

    # skipping when sam_extract == '4' as it's expired
    total_data = total_data[total_data.sam_extract != '4']

    # parse out executive compensation from the exec_comp_str column (index 89 in the extract)
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = total_data["exec_comp_str"].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del total_data["exec_comp_str"]
    total_data = total_data.join(parsed_data)

    # split into 3 dataframes based on the sam_extract value: '1' = delete, '2' = add, '3' = update
    delete_data = total_data[total_data.sam_extract == '1'].replace(np.nan,
                                                                    "",
                                                                    regex=True)
    add_data = total_data[total_data.sam_extract == '2'].replace(np.nan,
                                                                 "",
                                                                 regex=True)
    update_data = total_data[total_data.sam_extract == '3'].replace(np.nan,
                                                                    "",
                                                                    regex=True)
    for dataframe in [add_data, update_data, delete_data, total_data]:
        del dataframe["sam_extract"]

    table_name = ExecutiveCompensation.__table__.name
    insert_dataframe(add_data, table_name, sess.connection())
    for _, row in update_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            update(row, synchronize_session=False)
    for _, row in delete_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            delete(synchronize_session=False)
    sess.commit()
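
Both parse_sam_file variants read the pipe-delimited .dat extract positionally rather than by header, handing pandas an ordered mapping of short column names to zero-based column indexes; sorting the mapping by index keeps the supplied names aligned with the order in which the selected columns appear in the file. A self-contained illustration of that pattern on an in-memory sample (toy data, not a real SAM layout):

import io
from collections import OrderedDict

import pandas as pd

# Toy pipe-delimited extract: one header line, two records, one footer line
sample = ("HEADER|x|x|x|x\n"
          "123456789|a|b|c|2\n"
          "987654321|a|b|c|3\n"
          "FOOTER|x|x|x|x\n")

column_header_mapping = {"awardee_or_recipient_uniqu": 0, "sam_extract": 4}
ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

data = pd.read_csv(io.StringIO(sample), dtype=str, header=None, sep='|',
                   skiprows=1, nrows=2,
                   usecols=list(ordered.values()), names=list(ordered.keys()))
print(data.to_dict('records'))
# [{'awardee_or_recipient_uniqu': '123456789', 'sam_extract': '2'},
#  {'awardee_or_recipient_uniqu': '987654321', 'sam_extract': '3'}]
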