def load_location_data():
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        city_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                    'Key': "NationalFedCodes.txt"}, ExpiresIn=600)
        county_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                      'Key': "GOVT_UNITS.txt"}, ExpiresIn=600)
        state_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                     'Key': "state_list.txt"}, ExpiresIn=600)
        citystate_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                         'Key': "ctystate.txt"}, ExpiresIn=600)
        zip_city_file = urllib.request.urlopen(citystate_file)
    else:
        city_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "NationalFedCodes.txt")
        county_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "GOVT_UNITS.txt")
        state_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "state_list.txt")
        citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
        zip_city_file = open(citystate_file)

    with create_app().app_context():
        logger.info('Loading city data')
        load_city_data(city_file)
        logger.info('Loading county data')
        load_county_data(county_file)
        logger.info('Loading state data')
        load_state_data(state_file)
        logger.info('Loading zip city data')
        load_zip_city_data(zip_city_file)
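# Hypothetical invocation sketch (not part of the original module): assumes the loader runs as a
# standalone script; configure_logging is the same helper the other loader scripts call.
if __name__ == '__main__':
    configure_logging()
    load_location_data()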
def load_cgac(file_name):
    """Load CGAC (high-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

        # read CGAC values from csv
        data = pd.read_csv(file_name, dtype=str)

        # clean data
        data = clean_data(
            data,
            CGAC,
            {"cgac_agency_code": "cgac_code", "agency_name": "agency_name",
             "agency_abbreviation": "agency_abbreviation"},
            {"cgac_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['cgac_code'], inplace=True)

        delete_missing_cgacs(models, data)
        update_cgacs(models, data)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s CGAC records inserted', len(models))
def setup_emails():
    """Create email templates from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # insert email template types
        type_list = [
            ('review_submission', '')
        ]
        for t in type_list:
            email_id = sess.query(EmailTemplateType.email_template_type_id).filter(
                EmailTemplateType.name == t[0]).one_or_none()
            if not email_id:
                email_type = EmailTemplateType(name=t[0], description=t[1])
                sess.add(email_type)
        sess.commit()

        # insert email templates

        # Submission Review
        template = ("[REV_USER_NAME] has shared a DATA Act broker submission with you from [REV_AGENCY]. Click "
                    "<a href='[REV_URL]'>here</a> to review their submission. For questions or comments, please "
                    "visit the Service Desk at https://servicedesk.usaspending.gov/ or e-mail [email protected].")
        load_email_template(sess, "DATA Act Broker - Submission Ready for Review", template, "review_submission")
def load_frec(file_name):
    """Load FREC (high-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {frec.frec_code: frec for frec in sess.query(FREC)}

        # read FREC values from csv
        data = pd.read_csv(file_name, dtype=str)

        # clean data
        data = clean_data(
            data,
            FREC,
            {"frec": "frec_code", "cgac_agency_code": "cgac_code", "frec_entity_description": "agency_name",
             "agency_abbreviation": "agency_abbreviation"},
            {"frec": {"keep_null": False}, "cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4}}
        )
        # de-dupe
        data.drop_duplicates(subset=['frec_code'], inplace=True)

        # create foreign key dicts
        cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                     sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}

        # insert to db
        delete_missing_frecs(models, data)
        update_frecs(models, data, cgac_dict)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s FREC records inserted', len(models))
def read_zips():
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            for key in s3bucket.list(prefix=zip_folder):
                if key.name != zip_folder:
                    zip_4_file_path = key.generate_url(expires_in=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3bucket.get_key("ctystate.txt").generate_url(expires_in=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        logger.info("Zipcode script complete")
def load_county_data(county_file):
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete any data in the CountyCode table
        sess.query(CountyCode).delete()

        # parse the new county code data
        parse_county_file(county_file, sess)
def load_state_data(state_file):
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete any data in the States table
        sess.query(States).delete()

        # parse the new state data
        parse_state_file(state_file, sess)
def load_zip_city_data(zip_city_file):
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete any data in the ZipCity table
        sess.query(ZipCity).delete()

        # parse the new zip city data
        parse_zip_city_file(zip_city_file, sess)
def reset_alembic(alembic_version):
    with create_app().app_context():
        db = GlobalDB.db()
        engine = db.engine
        sess = db.session
        metadata = MetaData(bind=engine)
        alembic_table = Table('alembic_version', metadata, autoload=True)
        u = update(alembic_table)
        u = u.values({"version_num": alembic_version})
        sess.execute(u)
        sess.commit()
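# Hypothetical usage sketch: pin the alembic_version table back to a known migration revision.
# The revision id below is illustrative, not a real migration id from this repo.
reset_alembic('1a2b3c4d5e6f')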
def load_offices():
    """ Load FPDS Contracting Office file into broker database. """
    # read office file to dataframe, to make sure all is well with the file before firing up a db transaction
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        load_office = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                      'Key': "FPDSNG_Contracting_Offices.csv"},
                                                       ExpiresIn=600)
    else:
        load_office = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                   "FPDSNG_Contracting_Offices.csv")

    with create_app().app_context():
        update_offices(load_office)
def create_admin():
    """Create initial admin user."""
    logger.info('Creating admin user')
    admin_email = CONFIG_BROKER['admin_email']
    admin_pass = CONFIG_BROKER['admin_password']
    with create_app().app_context():
        sess = GlobalDB.db().session
        user = sess.query(User).filter(User.email == admin_email).one_or_none()
        if not user:
            # once the rest of the setup scripts are updated to use GlobalDB instead of databaseSession,
            # move the app_context creation up to initialize()
            user = create_user_with_password(admin_email, admin_pass, Bcrypt(), website_admin=True)
    return user
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Create temporary table to do work in so we don't disrupt the site for too long by altering the actual table
        sess.execute('CREATE TABLE IF NOT EXISTS temp_zips (LIKE zips INCLUDING ALL);')
        # Truncating in case we didn't clear out this table after a failure in the script
        sess.execute('TRUNCATE TABLE temp_zips;')
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url('get_object',
                                                                       {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                        'Key': obj['Key']}, ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url('get_object',
                                                              {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                               'Key': "ctystate.txt"}, ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)

            census_file = s3_client.generate_presigned_url('get_object',
                                                           {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                            'Key': "census_congressional_districts.csv"},
                                                           ExpiresIn=600)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

            census_file = os.path.join(base_path, "census_congressional_districts.csv")

        hot_swap_zip_tables(sess)
        update_state_congr_table_current(sess)
        update_state_congr_table_census(census_file, sess)

        logger.info("Zipcode script complete")
def load_fields(file_type_name, schema_file_name):
    """Load specified schema from a .csv."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # get file type object for specified fileTypeName
        file_type = sess.query(FileType).filter(FileType.name == file_type_name).one()

        # delete existing schema from database
        SchemaLoader.remove_columns_by_file_type(sess, file_type)

        # get allowable datatypes
        type_query = sess.query(FieldType.name, FieldType.field_type_id).all()
        types = {data_type.name: data_type.field_type_id for data_type in type_query}

        # add schema to database
        with open(schema_file_name, 'rU') as csvfile:
            reader = csv.DictReader(csvfile)
            file_column_count = 0
            for record in reader:
                record = FieldCleaner.clean_record(record)

                fields = ["fieldname", "required", "data_type"]
                if all(field in record for field in fields):
                    SchemaLoader.add_column_by_file_type(
                        sess,
                        types,
                        file_type,
                        FieldCleaner.clean_string(record["fieldname"]),
                        FieldCleaner.clean_string(record["fieldname_short"]),
                        record["required"],
                        record["data_type"],
                        record["padded_flag"],
                        record["field_length"])
                    file_column_count += 1
                else:
                    raise ValueError('CSV File does not follow schema')

            sess.commit()
            logger.info({
                'message': '{} {} schema records added to {}'.format(file_column_count, file_type_name,
                                                                     FileColumn.__tablename__),
                'message_type': 'ValidatorInfo',
                'file_type': file_type.letter_name
            })
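# Hypothetical usage sketch: 'appropriations' is an existing file type name; the schema CSV
# filename below is illustrative rather than the actual config file shipped with the validator.
load_fields('appropriations', 'appropriations_fields.csv')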
def load_labels(cls, filename):
    """Load non-SQL-based validation rules to db."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(ValidationLabel).delete()

        filename = os.path.join(cls.validation_labels_path, filename)

        # open csv
        with open(filename, 'rU') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            raw_field_names = header.split(',')
            field_names = []
            # clean field names
            for field in raw_field_names:
                field_names.append(FieldCleaner.clean_string(field))

            unknown_fields = set(field_names) - set(cls.headers)
            if len(unknown_fields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

            missing_fields = set(cls.headers) - set(field_names)
            if len(missing_fields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

            reader = csv.DictReader(csvfile, fieldnames=field_names)
            for row in reader:
                validation_label = ValidationLabel(label=row['label'], error_message=row['error_message'],
                                                   column_name=row['column_name'], label_type=row['label_type'])

                # look up file type id
                try:
                    file_id = FILE_TYPE_DICT[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))

                validation_label.file_id = file_id

                sess.merge(validation_label)
        sess.commit()
def load_offices(load_office=None):
    """ Load FPDS Contracting Office file into broker database. """
    # read office file to dataframe, to make sure all is well
    # with the file before firing up a db transaction
    if not load_office:
        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            load_office = s3bucket.get_key("FPDSNG_Contracting_Offices.csv").generate_url(expires_in=600)
        else:
            load_office = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                       "FPDSNG_Contracting_Offices.csv")

    with create_app().app_context():
        update_offices(load_office)
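# Hypothetical usage sketch: bypass the S3/config lookup by passing an explicit local copy of the
# FPDS contracting office extract (the path below is illustrative).
load_offices(load_office='/tmp/FPDSNG_Contracting_Offices.csv')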
def load_submission_window_schedule():
    """ Loads the submission window schedule data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        sub_schedule_file = s3_client.generate_presigned_url('get_object',
                                                             {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                              'Key': "submission_window_schedule.csv"}, ExpiresIn=600)
    else:
        sub_schedule_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config',
                                         'submission_window_schedule.csv')

    logger.info('Loading submission window schedule data')
    with create_app().app_context():
        data = pd.read_csv(sub_schedule_file, dtype=str)

        data = clean_data(
            data,
            SubmissionWindowSchedule,
            {'year': 'year',
             'period': 'period',
             'period_start': 'period_start',
             'publish_deadline': 'publish_deadline',
             'certification_deadline': 'certification_deadline'},
            {}
        )

        # Add a day to the deadlines because the dates in the file are supposed to be inclusive
        data['publish_deadline'] = data.apply(lambda x: add_day(x, 'publish_deadline'), axis=1)
        data['certification_deadline'] = data.apply(lambda x: add_day(x, 'certification_deadline'), axis=1)

        sess = GlobalDB.db().session

        # delete any data in the SubmissionWindowSchedule table
        sess.query(SubmissionWindowSchedule).delete()

        # insert data into table
        num = insert_dataframe(data, SubmissionWindowSchedule.__table__.name, sess.connection())
        logger.info('{} records inserted to submission_window_schedule'.format(num))
        sess.commit()
def load_sql(cls, filename):
    """ Load SQL-based validation rules to db. """
    with create_app().app_context():
        sess = GlobalDB.db().session
        filename = os.path.join(cls.sql_rules_path, filename)

        # Initial load
        sql_data = pd.read_csv(filename, dtype=str, usecols=cls.headers)
        sql_data = clean_data(
            sql_data,
            RuleSql,
            {'rule_label': 'rule_label', 'rule_error_message': 'rule_error_message', 'query_name': 'query_name',
             'expected_value': 'expected_value', 'category': 'category', 'file_type': 'file_type',
             'target_file': 'target_file', 'rule_cross_file_flag': 'rule_cross_file_flag',
             'severity_name': 'severity_name'},
            {}
        )

        # Processing certain values
        sql_data['rule_sql'] = sql_data['query_name'].apply(lambda name: cls.read_sql_str(name))
        sql_data['file_id'] = sql_data['file_type'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
        if sql_data['file_id'].isnull().values.any():
            raise Exception('Invalid file_type value found in sqlLoader. Must be one of the following: {}'
                            .format(', '.join(list(FILE_TYPE_DICT.keys()))))
        sql_data['target_file_id'] = sql_data['target_file'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
        sql_data['rule_cross_file_flag'] = sql_data['rule_cross_file_flag'].apply(
            lambda flag: flag in ('true', 't', 'y', 'yes'))
        sql_data['rule_severity_id'] = sql_data['severity_name'].apply(
            lambda severity_name: RULE_SEVERITY_DICT.get(severity_name, None))
        if sql_data['rule_severity_id'].isnull().values.any():
            raise Exception('Invalid severity_name value found in sqlLoader. Must be one of the following: {}'
                            .format(', '.join(list(RULE_SEVERITY_DICT.keys()))))
        sql_data.drop(['file_type', 'severity_name', 'target_file'], axis=1, inplace=True)

        # Final check if we need to actually reload
        if check_dataframe_diff(sql_data, RuleSql, del_cols=['rule_sql_id', 'created_at', 'updated_at'],
                                sort_cols=['rule_label', 'file_id', 'target_file_id']):
            # Delete and reload all records currently in table
            logger.info('Detected changes in {}, deleting RuleSQL and reloading'.format(cls.sql_rules_path))
            sess.query(RuleSql).delete()
            insert_dataframe(sql_data, RuleSql.__table__.name, sess.connection())
            sess.commit()
        else:
            logger.info('No changes detected since last load. Skipping.')
def setUpClass(cls):
    """ Set up class-wide resources (test data) """
    super(SettingsTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture

    with create_app().app_context():
        # get the submission test user
        sess = GlobalDB.db().session
        cls.session = sess

        cgac = CGAC(cgac_code='097')
        rule = RuleSql(rule_sql_id=1, rule_sql='', rule_label='FABS1', rule_error_message='', query_name='',
                       file_id=1, rule_severity_id=2, rule_cross_file_flag=False)
        sess.add_all([cgac, rule])
        sess.commit()

        default_setting = RuleSetting(agency_code='097', rule_label=rule.rule_label, file_id=rule.file_id,
                                      target_file_id=rule.target_file_id, priority=1, impact_id=1)
        sess.add(default_setting)
        sess.commit()
def read_zips():
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            for key in s3bucket.list(prefix=zip_folder):
                if key.name != zip_folder:
                    zip_4_file_path = key.generate_url(expires_in=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3bucket.get_key("ctystate.txt").generate_url(expires_in=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        update_state_congr_table(sess)

        logger.info("Zipcode script complete")
def setUpClass(cls):
    """ Set up class-wide resources (test data) """
    super(DashboardTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture

    with create_app().app_context():
        # get the submission test user
        sess = GlobalDB.db().session
        cls.session = sess

        submission_user = sess.query(User).filter(User.email == cls.test_users['admin_user']).one()
        cls.submission_user_id = submission_user.user_id

        other_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
        cls.other_user_id = other_user.user_id

        no_submissions_user = sess.query(User).filter(User.email == cls.test_users['no_permissions_user']).one()
        cls.no_submissions_user_email = no_submissions_user.email
        cls.no_submissions_user_id = no_submissions_user.user_id

        cls.quarter_sub = insert_submission(cls.session, cls.submission_user_id, cgac_code='SYS',
                                            start_date='01/2017', end_date='03/2017', is_quarter=True)
def load_country_codes(filename):
    """Load country code lookup table."""
    model = CountryCode

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for country codes, delete and replace values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            model,
            {"country_code": "country_code", "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

        logger.info('{} records inserted to {}'.format(num, table_name))
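# Hypothetical usage sketch: assumes a CSV with country_code and country_name columns;
# the filename below is illustrative.
load_country_codes('country_codes.csv')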
def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    # TODO: refactor into a pytest class fixtures and inject as necessary
    # update application's db config options so unittests
    # run against test databases
    configure_logging()

    suite = cls.__name__.lower()
    config = dataactcore.config.CONFIG_DB
    cls.num = randint(1, 9999)
    config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
    dataactcore.config.CONFIG_DB = config
    create_database(CONFIG_DB['db_name'])
    run_migrations()

    app = create_app()
    app.config['TESTING'] = True
    app.config['DEBUG'] = False
    cls.app = TestApp(app)

    # Allow us to augment default test failure msg w/ more detail
    cls.longMessage = True
    # Upload files to S3 (False = skip re-uploading on subsequent runs)
    cls.uploadFiles = True
    # Run tests for local broker or not
    cls.local = CONFIG_BROKER['local']
    # This needs to be set to the local directory for error reports if local is True
    cls.local_file_directory = CONFIG_SERVICES['error_report_path']

    # drop and re-create test job db/tables
    setup_job_tracker_db()
    # drop and re-create test error db/tables
    setup_error_db()
    # drop and re-create test validation db
    setup_validation_db()

    cls.userId = None
    # constants to use for default submission start and end dates
    cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
    cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
def setUpClass(cls):
    """Set up resources to be shared within a test class"""
    # TODO: refactor into a pytest class fixtures and inject as necessary
    # update application's db config options so unittests
    # run against test databases
    suite = cls.__name__.lower()
    config = dataactcore.config.CONFIG_DB
    cls.num = randint(1, 9999)
    config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
    dataactcore.config.CONFIG_DB = config
    create_database(CONFIG_DB['db_name'])
    run_migrations()

    app = create_app()
    app.config['TESTING'] = True
    app.config['DEBUG'] = False
    cls.app = TestApp(app)

    # Allow us to augment default test failure msg w/ more detail
    cls.longMessage = True
    # Upload files to S3 (False = skip re-uploading on subsequent runs)
    cls.uploadFiles = True
    # Run tests for local broker or not
    cls.local = CONFIG_BROKER['local']
    # This needs to be set to the local directory for error reports if local is True
    cls.local_file_directory = CONFIG_SERVICES['error_report_path']

    # drop and re-create test job db/tables
    setup_job_tracker_db()
    # drop and re-create test error db/tables
    setup_error_db()
    # drop and re-create test validation db
    setup_validation_db()

    cls.userId = None
    # constants to use for default submission start and end dates
    cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
    cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
def setUpClass(cls):
    """Set up class-wide resources (test data)"""
    super(GenerationTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture
    with create_app().app_context():
        # get the submission test user
        sess = GlobalDB.db().session
        submission_user = sess.query(User).filter(User.email == cls.test_users['admin_user']).one()
        cls.submission_user_id = submission_user.user_id

        other_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
        cls.other_user_email = other_user.email
        cls.other_user_id = other_user.user_id

        # setup submission/jobs data for test_check_status
        cls.generation_submission_id = insert_submission(sess, cls.submission_user_id, cgac_code="SYS",
                                                         start_date="07/2015", end_date="09/2015", is_quarter=True)
        cls.setup_file_generation_submission(sess)

        cls.test_fabs_submission_id = insert_submission(sess, cls.submission_user_id, cgac_code="SYS",
                                                        start_date="10/2015", end_date="12/2015", is_quarter=False,
                                                        number_of_errors=0, is_fabs=True)
def load_quarterly_threshold():
    """ Loads the quarterly revalidation threshold data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        threshold_file = s3_client.generate_presigned_url('get_object',
                                                          {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                           'Key': "quarterly_submission_starts.csv"}, ExpiresIn=600)
    else:
        threshold_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                      "quarterly_submission_starts.csv")

    logger.info('Loading quarterly revalidation threshold data')
    with create_app().app_context():
        data = pd.read_csv(threshold_file, dtype=str)

        data = clean_data(
            data,
            QuarterlyRevalidationThreshold,
            {"year": "year", "quarter": "quarter", "window_start": "window_start"},
            {}
        )

        sess = GlobalDB.db().session

        # delete any data in the QuarterlyRevalidationThreshold table
        sess.query(QuarterlyRevalidationThreshold).delete()

        # insert data into table
        num = insert_dataframe(data, QuarterlyRevalidationThreshold.__table__.name, sess.connection())
        logger.info('{} records inserted to quarterly_revalidation_threshold'.format(num))
        sess.commit()
def load_sub_tier_agencies(file_name):
    """Load Sub Tier Agency (sub_tier-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {sub_tier_agency.sub_tier_agency_code: sub_tier_agency
                  for sub_tier_agency in sess.query(SubTierAgency)}

        # read Sub Tier Agency values from csv
        data = pd.read_csv(file_name, dtype=str)

        condition = data["FPDS DEPARTMENT ID"] == data["SUBTIER CODE"]
        data.loc[condition, "PRIORITY"] = 1
        data.loc[~condition, "PRIORITY"] = 2

        # clean data
        data = clean_data(
            data,
            SubTierAgency,
            {"cgac_agency_code": "cgac_code", "subtier_code": "sub_tier_agency_code", "priority": "priority",
             "frec": "frec_code", "subtier_name": "sub_tier_agency_name", "is_frec": "is_frec"},
            {"cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4},
             "sub_tier_agency_code": {"pad_to_length": 4}}
        )
        # de-dupe
        data.drop_duplicates(subset=['sub_tier_agency_code'], inplace=True)

        # create foreign key dicts
        cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for cgac in
                     sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}
        frec_dict = {str(frec.frec_code): frec.frec_id for frec in
                     sess.query(FREC).filter(FREC.frec_code.in_(data["frec_code"])).all()}

        delete_missing_sub_tier_agencies(models, data)
        update_sub_tier_agencies(models, data, cgac_dict, frec_dict)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s Sub Tier Agency records inserted', len(models))
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
        s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
        filename = s3bucket.get_key("object_class.csv").generate_url(expires_in=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code", "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

        logger.info('{} records inserted to {}'.format(num, table_name))
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "country_codes.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for country codes, delete and replace values
        sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            CountryCode,
            {"country_code": "country_code", "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

        logger.info('{} records inserted to {}'.format(num, table_name))
def load_agency_data(base_path):
    """ Load agency data into the database

        Args:
            base_path: directory that contains the agency files
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        agency_list_file = s3_client.generate_presigned_url('get_object',
                                                            {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                             'Key': "agency_list.csv"}, ExpiresIn=600)
        cascading_agency_list_file = s3_client.generate_presigned_url('get_object',
                                                                      {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                       'Key': "agency_codes_list.csv"}, ExpiresIn=600)
    else:
        agency_list_file = os.path.join(base_path, "agency_list.csv")
        cascading_agency_list_file = os.path.join(base_path, "agency_codes_list.csv")

    with create_app().app_context():
        logger.info('Loading CGAC')
        load_cgac(agency_list_file)
        logger.info('Loading FREC')
        load_frec(cascading_agency_list_file)
        logger.info('Loading Sub Tier Agencies')
        load_sub_tier_agencies(cascading_agency_list_file)
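# Hypothetical invocation sketch: point the agency loader at the validator config directory,
# mirroring the local (non-AWS) path convention used by the loaders above.
load_agency_data(os.path.join(CONFIG_BROKER['path'], 'dataactvalidator', 'config'))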
def load_sql(cls, filename):
    """Load SQL-based validation rules to db."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(RuleSql).delete()

        filename = os.path.join(cls.sql_rules_path, filename)

        # open csv
        with open(filename, 'rU') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            raw_field_names = header.split(',')
            field_names = []
            # clean field names
            for field in raw_field_names:
                field_names.append(FieldCleaner.clean_string(field))

            unknown_fields = set(field_names) - set(cls.headers)
            if len(unknown_fields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

            missing_fields = set(cls.headers) - set(field_names)
            if len(missing_fields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

            reader = csv.DictReader(csvfile, fieldnames=field_names)
            for row in reader:
                sql = cls.read_sql_str(row['query_name'])

                rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                   rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                # look up file type id
                try:
                    file_id = FILE_TYPE_DICT[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))
                try:
                    if row["target_file"].strip() == "":
                        # No target file provided
                        target_file_id = None
                    else:
                        target_file_id = FILE_TYPE_DICT[row["target_file"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["target_file"], row["rule_label"]))

                # set cross file flag
                flag = FieldCleaner.clean_string(row["rule_cross_file_flag"])
                if flag in ('true', 't', 'y', 'yes'):
                    cross_file_flag = True
                else:
                    cross_file_flag = False

                rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                rule_sql.file_id = file_id
                rule_sql.target_file_id = target_file_id
                rule_sql.rule_cross_file_flag = cross_file_flag

                sess.merge(rule_sql)
        sess.commit()
    'award_procurement': {
        'staging_table': AwardProcurement,
        'certified_table': CertifiedAwardProcurement,
        'staging_id': 'award_procurement_id',
        'certified_id': 'certified_award_procurement_id',
        'file_type_id': FILE_TYPE_DICT['award_procurement']
    },
    'award_financial_assistance': {
        'staging_table': AwardFinancialAssistance,
        'certified_table': CertifiedAwardFinancialAssistance,
        'staging_id': 'award_financial_assistance_id',
        'certified_id': 'certified_award_financial_assistance_id',
        'file_type_id': FILE_TYPE_DICT['award']
    }
}

for award_type, award_dict in aw_data_map.items():
    copy_certified_submission_award_data(award_dict['staging_table'], award_dict['certified_table'],
                                         award_dict['staging_id'])
    load_updated_award_data(award_dict['staging_table'], award_dict['certified_table'], award_dict['file_type_id'],
                            shared_internal_cols + [award_dict['certified_id']])


if __name__ == '__main__':
    configure_logging()

    with create_app().app_context():
        main()
def uncache_file_requests():
    logger.info('Un-caching file generation requests')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(FileRequest).update({"is_cached_file": False}, synchronize_session=False)
        sess.commit()
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False, table=DUNS, year=None):
    """ Takes in a SAM file and adds the DUNS data to the database

        Args:
            file_path: the path to the SAM file
            sess: the database connection
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            table: the table to work from (could be DUNS/HistoricParentDuns)
            year: the year associated with the data (primarily for HistoricParentDUNS loads)
    """
    parse_start_time = time.time()
    logger.info("Starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "registration_date": 6,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10,
            "dba_name": 11,
            "address_line_1": 14,
            "address_line_2": 15,
            "city": 16,
            "state": 17,
            "zip": 18,
            "zip4": 19,
            "country_code": 20,
            "congressional_district": 21,
            "entity_structure": 27,
            "business_types_raw": 31,
            "ultimate_parent_legal_enti": 186,
            "ultimate_parent_unique_ide": 187
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = (nrows - 1) // block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = ((nrows % block_size) or block_size) - skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch * block_size)
            nrows = (((batch + 1) * block_size) - skiprows) if (batch < batches) else last_block_size
            logger.info('Loading rows %s to %s', skiprows + 1, nrows + skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys(), quoting=3)

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # convert business types string to array
                    bt_func = (lambda bt_raw: pd.Series([[str(code) for code in str(bt_raw).split('~')
                                                          if isinstance(bt_raw, str)]]))
                    csv_data = csv_data.assign(business_types_codes=csv_data["business_types_raw"].apply(bt_func))
                    del csv_data["business_types_raw"]
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None), table=table)

                    if monthly:
                        logger.info("Adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        if year:
                            csv_data['year'] = year
                        insert_dataframe(csv_data, table.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time() - bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("Attempting to bulk load add data")
                                insert_dataframe(add_data, table.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("Bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, sess, benchmarks=benchmarks)
                                logger.info("Loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks,
                                                 table=table)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, sess,
                                                                           benchmarks=benchmarks)
                            logger.info("Loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models,
                                             benchmarks=benchmarks, table=table)
                    sess.commit()

            added_rows += nrows
            batch += 1
        logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name,
                                                                         time.time() - parse_start_time, added_rows))
def setUpClass(cls):
    """Set up class-wide resources (test data)"""
    super(ListSubmissionTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture
    with create_app().app_context():
        # get an admin and non-admin user
        sess = GlobalDB.db().session
        cls.session = sess
        admin_user = sess.query(User).filter(User.email == cls.test_users['admin_user']).one()
        cls.admin_user_id = admin_user.user_id
        other_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
        cls.other_user_id = other_user.user_id

        # set up submissions for dabs
        cls.non_admin_dabs_sub_id = insert_submission(
            sess, cls.other_user_id, cgac_code="SYS", start_date="10/2015", end_date="12/2015", is_quarter=True,
            is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/01/2010')
        cls.admin_dabs_sub_id = insert_submission(
            sess, cls.admin_user_id, cgac_code="000", start_date="10/2015", end_date="12/2015", is_quarter=True,
            is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/01/2012')
        cls.certified_dabs_sub_id = insert_submission(
            sess, cls.admin_user_id, cgac_code="SYS", start_date="10/2015", end_date="12/2015", is_quarter=True,
            is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['published'])

        # Add a couple jobs for dabs files
        insert_job(sess, FILE_TYPE_DICT['appropriations'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'],
                   cls.non_admin_dabs_sub_id, filename='/path/to/test/file_1.csv', file_size=123, num_rows=3)
        insert_job(sess, FILE_TYPE_DICT['award'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'],
                   cls.non_admin_dabs_sub_id, filename='/path/to/test/file_2.csv', file_size=123, num_rows=3)
        insert_job(sess, FILE_TYPE_DICT['award'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'],
                   cls.certified_dabs_sub_id, filename='/path/to/test/file_part_2.csv', file_size=123, num_rows=3)

        # set up submissions for fabs
        cls.non_admin_fabs_sub_id = insert_submission(
            sess, cls.admin_user_id, cgac_code="SYS", start_date="10/2015", end_date="12/2015", is_fabs=True,
            publish_status_id=PUBLISH_STATUS_DICT['unpublished'])
        cls.admin_fabs_sub_id = insert_submission(
            sess, cls.other_user_id, cgac_code="000", start_date="10/2015", end_date="12/2015", is_fabs=True,
            publish_status_id=PUBLISH_STATUS_DICT['unpublished'])
        cls.published_fabs_sub_id = insert_submission(
            sess, cls.other_user_id, cgac_code="000", start_date="10/2015", end_date="12/2015", is_fabs=True,
            publish_status_id=PUBLISH_STATUS_DICT['published'])

        # Add a job for a FABS submission
        insert_job(sess, FILE_TYPE_DICT['fabs'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'],
                   cls.admin_fabs_sub_id, filename=str(cls.admin_fabs_sub_id) + '/test_file.csv', file_size=123,
                   num_rows=3)
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False):
    parse_start_time = time.time()
    logger.info("starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = nrows // block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = (nrows % block_size) - skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch * block_size)
            nrows = (((batch + 1) * block_size) - skiprows) if (batch < batches) else last_block_size
            logger.info('loading rows %s to %s', skiprows + 1, nrows + skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys())

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None))

                    if monthly:
                        logger.info("adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        insert_dataframe(csv_data, DUNS.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time() - bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("attempting to bulk load add data")
                                insert_dataframe(add_data, DUNS.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, benchmarks=benchmarks)
                                logger.info("loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, benchmarks=benchmarks)
                            logger.info("loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models,
                                             benchmarks=benchmarks)
                    sess.commit()

            added_rows += nrows
            batch += 1
        logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name,
                                                                         time.time() - parse_start_time, added_rows))
def read_zips():
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url('get_object',
                                                                       {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                        'Key': obj['Key']}, ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url('get_object',
                                                              {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                               'Key': "ctystate.txt"}, ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)

            census_file = s3_client.generate_presigned_url('get_object',
                                                           {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                            'Key': "census_congressional_districts.csv"},
                                                           ExpiresIn=600)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

            census_file = os.path.join(base_path, "census_congressional_districts.csv")

        update_state_congr_table_current(sess)
        update_state_congr_table_census(census_file, sess)

        logger.info("Zipcode script complete")
def load_sf133(filename, fiscal_year, fiscal_period, force_sf133_load=False, metrics=None):
    """ Load SF 133 (budget execution report) lookup table.

        Args:
            filename: name/path of the file to read in
            fiscal_year: fiscal year of the file being loaded
            fiscal_period: fiscal period of the file being loaded
            force_sf133_load: boolean to indicate whether to force a reload of the data
            metrics: an object containing information for the metrics file
    """
    if not metrics:
        metrics = {}

    with create_app().app_context():
        sess = GlobalDB.db().session

        existing_records = sess.query(SF133).filter(SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
        if force_sf133_load:
            # force a reload of this period's current data
            logger.info('Force SF 133 load: deleting existing records for %s %s', fiscal_year, fiscal_period)
            delete_count = existing_records.delete()
            logger.info('%s records deleted', delete_count)
            metrics['records_deleted'] += delete_count
        elif existing_records.count():
            # if there's existing data & we're not forcing a load, skip
            logger.info('SF133 %s %s already in database (%s records). Skipping file.', fiscal_year, fiscal_period,
                        existing_records.count())
            return

        data = clean_sf133_data(filename, SF133)

        # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones we don't actually
        # use in the validations. Arguably, it would be better just to include everything, but that drastically
        # increases the number of records we're inserting to the sf_133 table. If we ever decide that we need *all*
        # SF 133 lines that are zero value, remove the next two lines.
        sf_133_validation_lines = [
            '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022', '1023', '1024', '1025', '1026', '1029',
            '1030', '1031', '1032', '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280', '1340', '1440',
            '1540', '1640', '1750', '1850', '1910', '2190', '2490', '2500', '3020', '4801', '4802', '4881', '4882',
            '4901', '4902', '4908', '4981', '4982'
        ]
        data = data[(data.line.isin(sf_133_validation_lines)) | (data.amount != 0)]

        # we didn't use the 'keep_null' option when padding allocation transfer agency, because nulls in that column
        # break the pivot (see above comments). so, replace the ata '000' with an empty value before inserting to db
        data['allocation_transfer_agency'] = data['allocation_transfer_agency'].str.replace('000', '')
        # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented
        # as NULL in the db.
        data = data.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

        # Keeping display_tas out here as it depends on empty allocation_transfer_agency being None and not 000
        data['display_tas'] = data.apply(lambda row: concat_display_tas_dict(row), axis=1)

        # insert to db
        table_name = SF133.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        metrics['records_inserted'] += num

        update_tas_id(int(fiscal_year), int(fiscal_period))
        sess.commit()

        logger.info('%s records inserted to %s', num, table_name)
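# Hypothetical usage sketch: load_sf133 increments metrics['records_deleted'] and
# metrics['records_inserted'], so callers would pass a dict with those keys pre-initialized.
# The filename, year, and period below are illustrative.
sf133_metrics = {'records_deleted': 0, 'records_inserted': 0}
load_sf133('sf_133_2017_06.csv', 2017, 6, force_sf133_load=False, metrics=sf133_metrics)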
def setUpClass(cls):
    """Set up class-wide resources (test data)"""
    super(ListLatestPublishedFileTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture
    with create_app().app_context():
        # get the submission test user
        sess = GlobalDB.db().session
        cls.session = sess
        other_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
        cls.other_user_email = other_user.email
        cls.other_user_id = other_user.user_id
        cls.submission_user_id = other_user.user_id

        # ======= Reference ======
        cgac = CGAC(cgac_id=11, cgac_code='111', agency_name='CGAC 1')
        frec = FREC(frec_id=12, cgac_id=11, frec_code='2222', agency_name='FREC 2')
        cgac2 = CGAC(cgac_id=13, cgac_code='333', agency_name='CGAC 3')
        sess.add_all([cgac, frec, cgac2])
        sess.commit()

        year = 2020
        period = 6
        diff_year = 2021
        diff_period = 7

        # ======= DABS =======
        cls.dabs_sub_unpub = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac2.cgac_code, reporting_fiscal_year=1999,
            reporting_fisacal_period=2, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], is_fabs=False)

        cls.dabs_sub_pub_twice = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=year,
            reporting_fisacal_period=period, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=False)
        cls.setup_published_submission(sess, cls.dabs_sub_pub_twice, date='01/01/2020', is_fabs=False)
        cls.setup_published_submission(sess, cls.dabs_sub_pub_twice, date='01/02/2020', is_fabs=False)

        cls.dabs_sub_pub_diff_agency = insert_submission(
            sess, cls.submission_user_id, frec_code=frec.frec_code, reporting_fiscal_year=year,
            reporting_fisacal_period=period, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=False)
        cls.setup_published_submission(sess, cls.dabs_sub_pub_diff_agency, is_fabs=False)

        cls.dabs_sub_pub_diff_year = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=diff_year,
            reporting_fisacal_period=period, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=False)
        cls.setup_published_submission(sess, cls.dabs_sub_pub_diff_year, is_fabs=False)

        cls.dabs_sub_pub_diff_period = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=year,
            reporting_fisacal_period=diff_period, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=False)
        cls.setup_published_submission(sess, cls.dabs_sub_pub_diff_period, is_fabs=False)

        # ======= FABS =======
        cls.fabs_sub_unpub = insert_submission(
            sess, cls.submission_user_id, cgac_code='333', reporting_fiscal_year=None,
            reporting_fisacal_period=None, publish_status_id=1, is_fabs=True)

        cls.fabs_sub_pub = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=None,
            reporting_fisacal_period=None, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=True)
        cls.setup_published_submission(sess, cls.fabs_sub_pub, date='10/01/2000', is_fabs=True)

        cls.fabs_sub_pub_2 = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=None,
            reporting_fisacal_period=None, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=True)
        cls.setup_published_submission(sess, cls.fabs_sub_pub_2, date='10/02/2000', is_fabs=True)

        cls.fabs_sub_pub_diff_agency = insert_submission(
            sess, cls.submission_user_id, frec_code=frec.frec_code, reporting_fiscal_year=None,
            reporting_fisacal_period=None, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=True)
        cls.setup_published_submission(sess, cls.fabs_sub_pub_diff_agency, date='10/01/2000', is_fabs=True)

        cls.fabs_sub_pub_diff_year = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=None,
            reporting_fisacal_period=None, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=True)
        cls.setup_published_submission(sess, cls.fabs_sub_pub_diff_year, date='10/01/2001', is_fabs=True)

        cls.fabs_sub_pub_diff_period = insert_submission(
            sess, cls.submission_user_id, cgac_code=cgac.cgac_code, reporting_fiscal_year=None,
            reporting_fisacal_period=None, publish_status_id=PUBLISH_STATUS_DICT['published'], is_fabs=True)
        cls.setup_published_submission(sess, cls.fabs_sub_pub_diff_period, date='01/01/2001', is_fabs=True)
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')

    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
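# Hypothetical invocation sketch: load a local copy of the CFDA extract instead of fetching
# S3_CFDA_FILE; the directory path below is illustrative.
load_cfda_program('dataactvalidator/config', load_local=True, local_file_name='cfda_program.csv')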
def setUpClass(cls): """Set up class-wide resources (test data)""" super(ListSubmissionTests, cls).setUpClass() # TODO: refactor into a pytest fixture with create_app().app_context(): # get an admin and non-admin user sess = GlobalDB.db().session cls.session = sess admin_user = sess.query(User).filter( User.email == cls.test_users['admin_user']).one() cls.admin_user_id = admin_user.user_id other_user = sess.query(User).filter( User.email == cls.test_users['agency_user']).one() cls.other_user_id = other_user.user_id # set up submissions for dabs cls.non_admin_dabs_sub_id = insert_submission( sess, cls.other_user_id, cgac_code='SYS', start_date='10/2015', end_date='12/2015', is_quarter=True, is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/01/2010') cls.admin_dabs_sub_id = insert_submission( sess, cls.admin_user_id, cgac_code='000', start_date='10/2015', end_date='12/2015', is_quarter=True, is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/01/2012') cls.test_sub_id = insert_submission( sess, cls.admin_user_id, cgac_code='SYS', start_date='10/2015', end_date='12/2015', is_quarter=True, is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/02/2012', test_submission=True) # This is the min date, but the date everything should be using is the one in the job (MAX_UPDATED_AT) cls.certified_dabs_sub_id = insert_submission( sess, cls.admin_user_id, cgac_code='SYS', start_date='10/2015', end_date='12/2015', is_quarter=True, is_fabs=False, publish_status_id=PUBLISH_STATUS_DICT['published'], updated_at='01/01/2000') # Add a couple jobs for dabs files, make sure the updated at is the same as or earlier than the one on # the submission itself insert_job(sess, FILE_TYPE_DICT['appropriations'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'], cls.non_admin_dabs_sub_id, filename='/path/to/test/file_1.csv', file_size=123, num_rows=3, updated_at='01/01/2009') insert_job(sess, FILE_TYPE_DICT['award'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'], cls.non_admin_dabs_sub_id, filename='/path/to/test/file_2.csv', file_size=123, num_rows=3, updated_at='01/01/2009') # Min updated at date insert_job(sess, FILE_TYPE_DICT['award'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'], cls.certified_dabs_sub_id, filename='/path/to/test/file_part_2.csv', file_size=123, num_rows=3, updated_at=cls.MAX_UPDATED_AT) # set up submissions for fabs cls.non_admin_fabs_sub_id = insert_submission( sess, cls.admin_user_id, cgac_code='SYS', start_date='10/2015', end_date='12/2015', is_fabs=True, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/01/2016') # This is the min date, but the date everything should be using is the one in the job (MAX_UPDATED_AT) cls.admin_fabs_sub_id = insert_submission( sess, cls.other_user_id, cgac_code='000', start_date='10/2015', end_date='12/2015', is_fabs=True, publish_status_id=PUBLISH_STATUS_DICT['unpublished'], updated_at='01/01/2000') cls.published_fabs_sub_id = insert_submission( sess, cls.other_user_id, cgac_code='000', start_date='10/2015', end_date='12/2015', is_fabs=True, publish_status_id=PUBLISH_STATUS_DICT['published'], updated_at='01/02/2000') # Add a job for a FABS submission insert_job(sess, FILE_TYPE_DICT['fabs'], FILE_STATUS_DICT['complete'], JOB_TYPE_DICT['file_upload'], cls.admin_fabs_sub_id, filename=str(cls.admin_fabs_sub_id) + '/test_file.csv', file_size=123, num_rows=3, updated_at=cls.MAX_UPDATED_AT)
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False): parse_start_time = time.time() logger.info("Starting file " + str(file_path)) dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat' sam_file_type = "MONTHLY" if monthly else "DAILY" dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0] with create_app().app_context(): column_header_mapping = { "awardee_or_recipient_uniqu": 0, "sam_extract_code": 4, "registration_date": 6, "expiration_date": 7, "last_sam_mod_date": 8, "activation_date": 9, "legal_business_name": 10 } column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1])) # Initial sweep of the file to see rows and possibly what DUNS we're updating if benchmarks: initial_sweep = time.time() nrows = 0 with zipfile.ZipFile(file_path) as zip_file: with zip_file.open(dat_file_name) as dat_file: nrows = len(dat_file.readlines()) if benchmarks: logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep)) block_size = 10000 batches = nrows//block_size # skip the first line again if the last batch is also the first batch skiplastrows = 2 if batches == 0 else 1 last_block_size = (nrows % block_size)-skiplastrows batch = 0 added_rows = 0 while batch <= batches: skiprows = 1 if batch == 0 else (batch*block_size) nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size logger.info('Loading rows %s to %s', skiprows+1, nrows+skiprows) with zipfile.ZipFile(file_path) as zip_file: with zip_file.open(dat_file_name) as dat_file: csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|', usecols=column_header_mapping_ordered.values(), names=column_header_mapping_ordered.keys(), quoting=3) # add deactivation_date column for delete records lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan])) csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date') if monthly else csv_data["sam_extract_code"].apply(lambda_func)) # removing rows where DUNS number isn't even provided csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull()) # cleaning and replacing NaN/NaT with None's csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None)) if monthly: logger.info("Adding all monthly data with bulk load") if benchmarks: bulk_month_load = time.time() del csv_data["sam_extract_code"] insert_dataframe(csv_data, DUNS.__table__.name, sess.connection()) if benchmarks: logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load)) else: add_data = csv_data[csv_data.sam_extract_code == '2'] update_delete_data = csv_data[(csv_data.sam_extract_code == '3') | (csv_data.sam_extract_code == '1')] for dataframe in [add_data, update_delete_data]: del dataframe["sam_extract_code"] if not add_data.empty: try: logger.info("Attempting to bulk load add data") insert_dataframe(add_data, DUNS.__table__.name, sess.connection()) except IntegrityError: logger.info("Bulk loading add data failed, loading add data by row") sess.rollback() models, activated_models = get_relevant_models(add_data, benchmarks=benchmarks) logger.info("Loading add data ({} rows)".format(len(add_data.index))) load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks) if not update_delete_data.empty: models, activated_models = get_relevant_models(update_delete_data, benchmarks=benchmarks) logger.info("Loading update_delete data ({} 
rows)".format(len(update_delete_data.index))) load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks) sess.commit() added_rows += nrows batch += 1 logger.info('%s DUNS records inserted', added_rows) if benchmarks: logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time, added_rows))
def load_sql(cls, filename):
    """Load SQL-based validation rules to db."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(RuleSql).delete()

        filename = os.path.join(cls.sql_rules_path, filename)

        # open csv ('rU' mode is deprecated; plain 'r' already gives universal newlines in Python 3)
        with open(filename, 'r') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            raw_field_names = header.split(',')
            field_names = []
            # clean field names
            for field in raw_field_names:
                field_names.append(FieldCleaner.clean_string(field))

            unknown_fields = set(field_names) - set(cls.headers)
            if len(unknown_fields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

            missing_fields = set(cls.headers) - set(field_names)
            if len(missing_fields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

            reader = csv.DictReader(csvfile, fieldnames=field_names)
            for row in reader:
                sql = cls.read_sql_str(row['query_name'])

                rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                   rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                # look up file type id
                try:
                    file_id = FILE_TYPE_DICT[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))
                try:
                    if row["target_file"].strip() == "":
                        # No target file provided
                        target_file_id = None
                    else:
                        target_file_id = FILE_TYPE_DICT[row["target_file"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["target_file"], row["rule_label"]))

                # set cross file flag
                flag = FieldCleaner.clean_string(row["rule_cross_file_flag"])
                cross_file_flag = flag in ('true', 't', 'y', 'yes')

                rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                rule_sql.file_id = file_id
                rule_sql.target_file_id = target_file_id
                rule_sql.rule_cross_file_flag = cross_file_flag

                sess.merge(rule_sql)
        sess.commit()
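# load_sql consumes the header line manually so it can clean the field names before handing the rest
# of the file to csv.DictReader. A small, self-contained sketch of that pattern; the rules file
# contents and the trivial cleaner standing in for FieldCleaner are illustrative only.
import csv
import io

def read_rules(csvfile, expected_headers):
    """Clean the header row, validate it, then parse the remaining rows as dicts."""
    clean = lambda s: s.strip().lower()
    field_names = [clean(field) for field in csvfile.readline().split(',')]
    if set(field_names) != set(expected_headers):
        raise ValueError('Unexpected rule file header: {}'.format(field_names))
    return list(csv.DictReader(csvfile, fieldnames=field_names))

rules_csv = io.StringIO('Rule_Label ,QUERY_NAME\nA1,a1_sql\n')
print(read_rules(rules_csv, ['rule_label', 'query_name']))
# [{'rule_label': 'A1', 'query_name': 'a1_sql'}]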
def load_program_activity_data(base_path): """ Load program activity lookup table. Args: base_path: directory of domain config files """ last_upload = get_date_of_current_pa_upload(base_path) if not (last_upload > get_stored_pa_last_upload()): return program_activity_file = get_program_activity_file(base_path) logger.info('Loading program activity: ' + PA_FILE_NAME) with create_app().app_context(): sess = GlobalDB.db().session try: data = pd.read_csv(program_activity_file, dtype=str) except pd.io.common.EmptyDataError as e: log_blank_file() exit_if_nonlocal(4) # exit code chosen arbitrarily, to indicate distinct failure states return headers = set([header.upper() for header in list(data)]) if not VALID_HEADERS.issubset(headers): logger.error("Missing required headers. Required headers include: %s" % str(VALID_HEADERS)) exit_if_nonlocal(4) return try: dropped_count, data = clean_data( data, ProgramActivity, {"fyq": "fiscal_year_quarter", "agency_code": "agency_id", "allocation_id": "allocation_transfer_id", "account_code": "account_number", "pa_code": "program_activity_code", "pa_title": "program_activity_name"}, {"program_activity_code": {"pad_to_length": 4}, "agency_id": {"pad_to_length": 3}, "allocation_transfer_id": {"pad_to_length": 3, "keep_null": True}, "account_number": {"pad_to_length": 4}}, ["agency_id", "program_activity_code", "account_number", "program_activity_name"], True ) except FailureThresholdExceededException as e: if e.count == 0: log_blank_file() exit_if_nonlocal(4) return else: count_str = "Application tried to drop {} rows".format(e.count) logger.error("Loading of program activity file failed due to exceeded failure threshold. " + count_str) exit_if_nonlocal(5) return sess.query(ProgramActivity).delete() # Lowercase Program Activity Name data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x)) # because we're only loading a subset of program activity info, # there will be duplicate records in the dataframe. this is ok, # but need to de-duped before the db load. We also need to log them. base_count = data.shape[0] data.drop_duplicates(inplace=True) logger.info("Dropped {} duplicate rows.".format(base_count - data.shape[0])) # insert to db table_name = ProgramActivity.__table__.name num = insert_dataframe(data, table_name, sess.connection()) sess.commit() set_stored_pa_last_upload(last_upload) logger.info('{} records inserted to {}'.format(num, table_name)) if dropped_count > 0: exit_if_nonlocal(3) return
def uncache_all_files():
    logger.info('Un-caching all generated files')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(FileGeneration).update({"is_cached_file": False}, synchronize_session=False)
        sess.commit()
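# The query above is a single bulk UPDATE; synchronize_session=False tells SQLAlchemy not to
# reconcile the change with objects already loaded in the session, which is fine because nothing in
# this short script reads those objects afterwards. A minimal sketch of the same pattern against a
# hypothetical table (assumes SQLAlchemy 1.4+; CachedFile is a stand-in, not the broker's model).
from sqlalchemy import Boolean, Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class CachedFile(Base):
    """Hypothetical stand-in for the FileGeneration model."""
    __tablename__ = 'cached_file'
    id = Column(Integer, primary_key=True)
    is_cached_file = Column(Boolean, default=True)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as sess:
    sess.add_all([CachedFile(), CachedFile()])
    sess.commit()
    # Bulk UPDATE: one SQL statement, no per-object flush, no session synchronization.
    sess.query(CachedFile).update({'is_cached_file': False}, synchronize_session=False)
    sess.commit()
    print(sess.query(CachedFile).filter_by(is_cached_file=False).count())  # 2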
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return

            headers = set([header.upper() for header in list(raw_data)])
            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id',
                     'allocation_id': 'allocation_transfer_id', 'account_code': 'account_number',
                     'pa_code': 'program_activity_code', 'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # because we're only loading a subset of program activity info, there will be duplicate records in the
            # dataframe. this is ok, but they need to be de-duped before the db load. We also need to log them.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

            if export:
                export_public_pa(raw_data)

        end_time = datetime.datetime.now()
        update_external_data_load_date(now, end_time, 'program_activity')
        update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['records_inserted'] = num
        metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        exit_if_nonlocal(3)
        return
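# The loader converts the file's fiscal-year-quarter values into fiscal-year-period form via
# convert_fyq_to_fyp. The broker's own implementation isn't shown here; this is a plausible sketch
# of the conversion (quarter N mapping to its closing period, N * 3) under that assumption.
import re

def convert_fyq_to_fyp(fyq):
    """Convert e.g. 'FY20Q2' to 'FY20P06'. Illustrative mapping only, not the broker's helper."""
    match = re.fullmatch(r'FY(\d{2})Q([1-4])', str(fyq).upper())
    if not match:
        return fyq  # leave unrecognized values untouched
    year, quarter = match.groups()
    return 'FY{}P{:02d}'.format(year, int(quarter) * 3)

print(convert_fyq_to_fyp('fy20q2'))  # FY20P06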
def setup_validation_db():
    """Create validation tables from model metadata and do initial inserts."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
def setup_submission_type_db():
    """Create submission type tables from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
import argparse
import logging

from dataactcore.interfaces.db import GlobalDB
from dataactcore.logging import configure_logging
from dataactcore.utils.parentDuns import sam_config_is_valid, get_duns_batches, update_missing_parent_names
from dataactvalidator.health_check import create_app

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    configure_logging()

    with create_app().app_context():
        parser = argparse.ArgumentParser(description='Update parent duns columns in DUNS table')
        parser.add_argument('-b', '--batch_start', help='Batch to start with (type int)', type=int, default=0)
        parser.add_argument('-e', '--batch_end', help='Batch to end with (type int)', type=int)
        parser.add_argument('-n', '--parent_name', help='Derives parent name at the end', action='store_true')
        args = parser.parse_args()

        # Parse argument to do load on certain update date
        # Possible option if we want to make sure items load
        sess = GlobalDB.db().session

        if args.parent_name:
            # Derive missing parent names when a parent DUNS number is provided
            update_missing_parent_names(sess)
        else:
            # The original else branch was truncated here; based on the imports and CLI arguments it
            # presumably validates the SAM config and runs the batched parent-DUNS update.
            # The call signature below is assumed, not confirmed.
            client = sam_config_is_valid()
            get_duns_batches(client, sess, args.batch_start, args.batch_end)
def load_cfda_program(base_path): """ Load cfda program. Args: base_path: directory that contains the cfda values files. """ if CONFIG_BROKER["use_aws"]: s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region']) filename = s3_client.generate_presigned_url( 'get_object', { 'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': "cfda_program.csv" }, ExpiresIn=600) else: filename = os.path.join(base_path, "cfda_program.csv") logger.info('Loading CFDA program file: ' + "cfda_program.csv") """Load country code lookup table.""" model = CFDAProgram with create_app().app_context(): configure_logging() sess = GlobalDB.db().session # for object class, delete and replace values sess.query(model).delete() data = pd.read_csv(filename, dtype=str, encoding='latin1') data = clean_data( data, model, { "program_title": "program_title", "program_number": "program_number", "popular_name_(020)": "popular_name", "federal_agency_(030)": "federal_agency", "authorization_(040)": "authorization", "objectives_(050)": "objectives", "types_of_assistance_(060)": "types_of_assistance", "uses_and_use_restrictions_(070)": "uses_and_use_restrictions", "applicant_eligibility_(081)": "applicant_eligibility", "beneficiary_eligibility_(082)": "beneficiary_eligibility", "credentials/documentation_(083)": "credentials_documentation", "preapplication_coordination_(091)": "preapplication_coordination", "application_procedures_(092)": "application_procedures", "award_procedure_(093)": "award_procedure", "deadlines_(094)": "deadlines", "range_of_approval/disapproval_time_(095)": "range_of_approval_disapproval_time", "appeals_(096)": "appeals", "renewals_(097)": "renewals", "formula_and_matching_requirements_(101)": "formula_and_matching_requirements", "length_and_time_phasing_of_assistance_(102)": "length_and_time_phasing_of_assistance", "reports_(111)": "reports", "audits_(112)": "audits", "records_(113)": "records", "account_identification_(121)": "account_identification", "obligations_(122)": "obligations", "range_and_average_of_financial_assistance_(123)": "range_and_average_of_financial_assistance", "program_accomplishments_(130)": "program_accomplishments", "regulations__guidelines__and_literature_(140)": "regulations_guidelines_and_literature", "regional_or__local_office_(151)": "regional_or_local_office", "headquarters_office_(152)": "headquarters_office", "website_address_(153)": "website_address", "related_programs_(160)": "related_programs", "examples_of_funded_projects_(170)": "examples_of_funded_projects", "criteria_for_selecting_proposals_(180)": "criteria_for_selecting_proposals", "url": "url", "recovery": "recovery", "omb_agency_code": "omb_agency_code", "omb_bureau_code": "omb_bureau_code", "published_date": "published_date", "archived_date": "archived_date" }, {}) data["published_date"] = format_date(data["published_date"]) data["archived_date"] = format_date(data["archived_date"]) # insert to db table_name = model.__table__.name num = insert_dataframe(data, table_name, sess.connection()) sess.commit() logger.info('{} records inserted to {}'.format(num, table_name))
def read_zips(): """ Update zip codes in the zips table. """ with create_app().app_context(): sess = GlobalDB.db().session # Create temporary table to do work in so we don't disrupt the site for too long by altering the actual table sess.execute( 'CREATE TABLE IF NOT EXISTS temp_zips (LIKE zips INCLUDING ALL);') # Truncating in case we didn't clear out this table after a failure in the script sess.execute('TRUNCATE TABLE temp_zips;') sess.commit() if CONFIG_BROKER["use_aws"]: zip_folder = CONFIG_BROKER["zip_folder"] + "/" s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region']) response = s3_client.list_objects_v2( Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder) for obj in response.get('Contents', []): if obj['Key'] != zip_folder: zip_4_file_path = s3_client.generate_presigned_url( 'get_object', { 'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': obj['Key'] }, ExpiresIn=600) parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess) # parse remaining 5 digit zips that weren't in the first file citystate_file = s3_client.generate_presigned_url( 'get_object', { 'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': "ctystate.txt" }, ExpiresIn=600) parse_citystate_file(urllib.request.urlopen(citystate_file), sess) census_file = s3_client.generate_presigned_url( 'get_object', { 'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': "census_congressional_districts.csv" }, ExpiresIn=600) else: base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"]) # creating the list while ignoring hidden files on mac file_list = [ f for f in os.listdir(base_path) if not re.match('^\.', f) ] for file in file_list: parse_zip4_file(open(os.path.join(base_path, file)), sess) # parse remaining 5 digit zips that weren't in the first file citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt") parse_citystate_file(open(citystate_file), sess) census_file = os.path.join(base_path, "census_congressional_districts.csv") hot_swap_zip_tables(sess) update_state_congr_table_current(sess) update_state_congr_table_census(census_file, sess) logger.info("Zipcode script complete")
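# read_zips builds the new data in temp_zips and then calls hot_swap_zip_tables so the live zips
# table is only unavailable for the instant of a rename rather than for the whole load. That helper
# isn't shown here; this is a sketch of the usual rename-swap idea it presumably implements, with
# table names assumed and the raw-SQL style of the surrounding code mirrored.
def hot_swap_tables(sess, live='zips', temp='temp_zips', old='old_zips'):
    """Swap a freshly loaded temp table into place with two renames, then drop the old table.

    Sketch of the technique only; the broker's hot_swap_zip_tables may also move indexes/constraints.
    """
    sess.execute('ALTER TABLE {} RENAME TO {}'.format(live, old))
    sess.execute('ALTER TABLE {} RENAME TO {}'.format(temp, live))
    sess.execute('DROP TABLE {}'.format(old))
    sess.commit()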
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"): """ Load cfda program. Args: base_path: directory that contains the cfda values files. """ if not load_local: logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE)) tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv" filename = os.path.join(base_path, tmp_name) r = requests.get(S3_CFDA_FILE, allow_redirects=True) open(filename, 'wb').write(r.content) else: filename = os.path.join(base_path, local_file_name) logger.info('Loading CFDA program file: ' + filename) """Load country code lookup table.""" model = CFDAProgram def fix_program_number(n, decimals=3): multiplier = 10 ** decimals value = math.floor(n * multiplier + 0.5) / multiplier return str(value).ljust(6, '0') with create_app().app_context(): configure_logging() sess = GlobalDB.db().session now = datetime.utcnow() import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False) import_data = clean_data( import_data, model, DATA_CLEANING_MAP, {} ) import_data["published_date"] = format_date(import_data["published_date"]) import_data["archived_date"] = format_date(import_data["archived_date"]) import_dataframe = import_data.copy(deep=True) # To do the comparison, first we need to mock the pk column that postgres creates. We'll set it universally to 1 import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now) table_name = model.__table__.name current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False) # Now we need to overwrite the db's audit dates in the created dataframe, and # also set all the pks to 1, so they match current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now) # pandas comparison requires everything to be in the same order current_data.sort_values('program_number', inplace=True) import_dataframe.sort_values('program_number', inplace=True) # columns too cols = import_dataframe.columns.tolist() cols.sort() import_dataframe = import_dataframe[cols] cols = current_data.columns.tolist() cols.sort() current_data = current_data[cols] # need to reset the indexes now that we've done all this sorting, so that they match import_dataframe.reset_index(drop=True, inplace=True) current_data.reset_index(drop=True, inplace=True) # My favorite part: When pandas pulls the data out of postgres, the program_number column # is a Decimal. However, in adding it to the dataframe, this column loses precision. # So for example, a program number of 10.001 imports into the dataframe as 10.000999999999999. # It also needs to be cast to astring, and padded with the right number of zeroes, as needed. current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x)) # Finally, you can execute this and get True back if the data truly has not changed from the last # time the CSV was loaded. new_data = not import_dataframe.equals(current_data) if new_data: # insert to db sess.query(model).delete() num = insert_dataframe(import_data, table_name, sess.connection()) sess.commit() if not load_local: os.remove(filename) if new_data: logger.info('{} records inserted to {}'.format(num, table_name)) else: logger.info("Skipped cfda load, no new data.") sys.exit(3)
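# The comparison above works by forcing both dataframes into a canonical form (same mocked pk and
# audit dates, sorted rows and columns, reset indexes) before calling DataFrame.equals. A compact
# sketch of that normalization on two toy frames; pk and audit-column handling is omitted.
import pandas as pd

def frames_match(imported, current, sort_col):
    """Return True when two dataframes hold the same rows, ignoring row and column order."""
    def canonical(df):
        df = df.sort_values(sort_col).reset_index(drop=True)
        return df[sorted(df.columns)]
    return canonical(imported).equals(canonical(current))

a = pd.DataFrame({'program_number': ['10.001', '10.500'], 'program_title': ['x', 'y']})
b = pd.DataFrame({'program_title': ['y', 'x'], 'program_number': ['10.500', '10.001']})
print(frames_match(a, b, 'program_number'))  # True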
def setup_job_tracker_db():
    """Create job tracker tables from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()