logging.basicConfig(filename=filepaths["log_filepath"] + filepaths['log_filename'],
                    format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    filemode=filepaths['log_mode'],
                    level=logging.INFO)

#instantiate Census file data as dataframe
geographic_data = pd.read_csv(geo_config['geographic_data_file'], delimiter='|',
                              header=0, names=geo_config['file_columns'],
                              dtype=object)

#create 5 digit county FIPS codes from 2 digit state and 3 digit county
geographic_data['county_fips'] = geographic_data.apply(
    lambda x: str(x.state_code) + str(x.county), axis=1)
#create 11 digit Census Tract codes from 5 digit county and 6 digit tract
geographic_data["tract_fips"] = geographic_data.apply(
    lambda x: str(x.county_fips) + str(x.tracts), axis=1)

with open(geo_config["zip_code_file"], 'r') as f:
    zip_codes = json.load(f)
zip_codes.append("Exempt")

#instantiate lar generator to create random LAR and fixed TS data
lar_gen = lar_generator.lar_gen(lar_schema_file=lar_schema_file,
                                ts_schema_file=ts_schema_file)

#set lar_file_config LEI to match the bank config data
lar_file_config_data["lei"]["value"] = bank_config_data["lei"]["value"]

#instantiate rules engine to check conformity of synthetic data to the FIG schema
rules_engine = rules_engine(config_data=lar_file_config_data,
                            state_codes=geo_config["state_codes"],
                            state_codes_rev=geo_config["state_codes_rev"],
                            geographic_data=geographic_data,
                            full_lar_file_check=False)

#instantiate constraints logic to force LAR data to conform to the FIG schema
lar_constraints = lar_data_constraints(lar_file_config=lar_file_config_data,
                                       geographic_data=geographic_data)

#store the original row for diff comparison to see which elements are changed
ts_row = lar_gen.make_ts_row(bank_file_config=bank_config_data) #create the TS row; only one is needed
ts_df = pd.DataFrame(ts_row, index=[0])
rules_engine.load_ts_data(ts_df) #loading the TS row into the rules engine converts it to a dataframe for value checking
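#A minimal, self-contained sketch of the FIPS concatenation above, using
#hypothetical sample values rather than the real geographic file: a 2 digit
#state code plus a 3 digit county code forms the 5 digit county FIPS, and
#appending the 6 digit tract code yields the 11 digit census tract FIPS.
import pandas as pd

fips_demo = pd.DataFrame({"state_code": ["01"], "county": ["001"],
                          "tracts": ["020100"]}, dtype=object)
fips_demo["county_fips"] = fips_demo["state_code"] + fips_demo["county"] #"01001"
fips_demo["tract_fips"] = fips_demo["county_fips"] + fips_demo["tracts"] #"01001020100"
print(fips_demo.loc[0, "tract_fips"]) #prints "01001020100"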
def __init__(self):
    #Loads the filepath configuration.
    with open('configurations/test_filepaths.yaml') as f:
        #Uses safe_load instead of load.
        self.filepaths = yaml.safe_load(f)
    #Loads the geographic file configuration.
    with open('configurations/geographic_data.yaml') as f:
        #Uses safe_load instead of load.
        self.geographic = yaml.safe_load(f)
    #Loads the clean file configuration.
    with open('configurations/clean_file_config.yaml') as f:
        #Uses safe_load instead of load.
        self.data_map = yaml.safe_load(f)

    #Stores the column names for the file containing geographic crosswalk data.
    file_cols = self.geographic['file_columns']

    #Sets the logging parameters.
    #Uses a log file name and file writing mode from the test_filepaths yaml.
    logging.basicConfig(filename=self.filepaths['log_filename'],
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        filemode=self.filepaths['log_mode'],
                        level=logging.INFO)

    #Loads geographic crosswalk data from the filepath named in the geographic_data yaml file.
    self.crosswalk_data = pd.read_csv(self.geographic['crosswalk_data_file'],
                                      delimiter='|', header=None,
                                      names=file_cols, dtype=str)

    #Creates county, tract, and small county data from the geographic crosswalk data.
    self.crosswalk_data['countyFips'] = self.crosswalk_data['stateCode'] + self.crosswalk_data['county']
    self.crosswalk_data["tractFips"] = self.crosswalk_data.countyFips + self.crosswalk_data.tracts
    self.counties = list(self.crosswalk_data.countyFips)
    self.tracts = list(self.crosswalk_data.tractFips)
    self.small_counties = list(self.crosswalk_data.countyFips[self.crosswalk_data.smallCounty == "S"])

    #Loads schemas for LAR and TS.
    #Schemas contain valid enumerations, including NA values, for each field in the dataset.
    self.lar_schema_df = pd.DataFrame(json.load(open(self.filepaths['lar_schema_json'], "r")))
    self.ts_schema_df = pd.DataFrame(json.load(open(self.filepaths['ts_schema_json'], "r")))

    #Instantiates the other classes.
    #lar_gen is responsible for generating data according to the values in the schema.
    self.lar_gen = lar_generator.lar_gen(self.lar_schema_df, self.ts_schema_df,
                                         counties=self.counties, tracts=self.tracts)
    #lar_constraints is responsible for modifying generated data so that
    #the resulting file passes syntax and validity edits.
    self.lar_const = lar_constraints.lar_constraints(counties=self.counties, tracts=self.tracts)
    #lar_validator checks a dataframe and returns a JSON with edit pass/fail results.
    self.lar_validator = rules_engine(lar_schema=self.lar_schema_df,
                                      ts_schema=self.ts_schema_df,
                                      crosswalk_data=self.crosswalk_data)
                                      #tracts=tracts, counties=counties, small_counties=small_counties)

    #Stores the number of rows in the test file.
    self.file_length = self.data_map["file_length"]["value"]
    #Stores the LEI for the test file.
    self.lei = self.data_map["lei"]["value"]
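#Hedged sketch of the shape configurations/test_filepaths.yaml might take for
#the __init__ above to run. The keys match the ones the code reads; the values
#shown are illustrative assumptions, not the repo's actual configuration.
import yaml

example_filepaths_yaml = """
log_filename: file_generator.log
log_mode: w
lar_schema_json: schemas/lar_schema.json
ts_schema_json: schemas/ts_schema.json
"""
filepaths_demo = yaml.safe_load(example_filepaths_yaml)
print(filepaths_demo["log_mode"]) #prints "w"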
def __init__(self, file_paths_config='configurations/test_filepaths.yaml',
             geo_config_file='configurations/geographic_data.yaml',
             clean_file_config='configurations/clean_file_config.yaml',
             edit_report_config='configurations/edit_report_config.yaml'):

    print("initializing file generator")
    #Loads the filepath configuration.
    print("opening config files")
    with open(file_paths_config) as f:
        #Uses safe_load instead of load.
        self.filepaths = yaml.safe_load(f)
    #Loads the geographic file configuration.
    with open(geo_config_file) as f:
        #Uses safe_load instead of load.
        self.geographic = yaml.safe_load(f)
    with open(self.geographic["geographic_data_file"]) as f:
        self.geo_data = yaml.safe_load(f)
    #Loads the clean file configuration.
    with open(clean_file_config) as f:
        #Uses safe_load instead of load.
        self.clean_config = yaml.safe_load(f)
    #Sets the geo config file as a class variable to instantiate other classes.
    self.geo_config_file = geo_config_file
    #Loads the edit report configuration.
    with open(edit_report_config) as f:
        #Uses safe_load instead of load.
        self.edit_report_config = yaml.safe_load(f)
    print("config files loaded")

    #Sets the logging parameters.
    #Uses a log file name and file writing mode from the test_filepaths yaml.
    print("configuring logging in file generator")
    logging.basicConfig(filename=self.filepaths['log_filename'],
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        filemode=self.filepaths['log_mode'],
                        level=logging.INFO)
    print("logging configured")

    #Loads geographic data from the filepath named in the geographic_data yaml file.
    print("loading geo data to file generator")
    self.geographic_data = pd.read_csv(self.geographic['geographic_data_file'],
                                       delimiter='|', header=None,
                                       names=self.geographic['file_columns'],
                                       dtype=object)
    #create 5 digit county FIPS codes from 2 digit state and 3 digit county
    self.geographic_data['county_fips'] = self.geographic_data.apply(
        lambda x: str(x.state_code) + str(x.county), axis=1)
    #create 11 digit Census Tract codes from 5 digit county and 6 digit tract
    self.geographic_data["tract_fips"] = self.geographic_data.apply(
        lambda x: str(x.county_fips) + str(x.tracts), axis=1)
    self.small_counties = list(self.geographic_data.county_fips[
        self.geographic_data.small_county == "S"])
    print("geo data loaded to file generator")

    #Loads schemas for LAR and TS.
    #Schemas contain valid enumerations, including NA values, for each field in the dataset.
    print("loading JSON schema files")
    self.lar_schema_df = pd.DataFrame(
        json.load(open(self.filepaths['lar_schema_json'], "r")))
    self.ts_schema_df = pd.DataFrame(
        json.load(open(self.filepaths['ts_schema_json'], "r")))

    #Instantiates the other classes.
    #lar_gen is responsible for generating data according to the values in the schema.
    print("instantiating class objects")
    print("lar gen loading")
    self.lar_gen = lar_generator.lar_gen()
    print("lar gen done")
    #lar_constraints is responsible for modifying generated data so that
    #the resulting file passes syntax and validity edits.
    print("lar constraints loading")
    self.lar_const = lar_constraints.lar_constraints()
    print("lar constraints loaded")
    #Collects the constraint methods whose names match edit codes
    #(an "s" or "v" prefix followed by three digits).
    self.constraints = []
    for func in dir(self.lar_const):
        if func[:1] in ("s", "v") and func[1:4].isdigit():
            self.constraints.append(func)
    #lar_validator checks a dataframe and returns a JSON with
    #edit pass/fail results.
print("rules engine loading") self.lar_validator = rules_engine(geo_config_file=self.geo_config_file) #tracts=tracts, counties=counties, small_counties=small_counties) print("rules engine loaded") print("file generator initialization complete")
                    names=cbsa_cols, dtype=str) #load tract to CBSA data from platform file
cbsas["tractFips"] = cbsas.countyFips + cbsas.tracts
counties = list(cbsas.countyFips)
tracts = list(cbsas.tractFips)
small_counties = list(cbsas.countyFips[cbsas.smallCounty == "1"])

#load schemas for LAR and transmittal sheet
#schemas contain valid enumerations, including NA values, for each field in the dataset
lar_schema_df = pd.DataFrame(json.load(open("../schemas/lar_schema.json", "r")))
ts_schema_df = pd.DataFrame(json.load(open("../schemas/ts_schema.json", "r")))

#instantiate class objects
#lar_gen is responsible for generating data according to the schema
lar_gen = lar_generator.lar_gen(lar_schema_df, ts_schema_df,
                                counties=counties, tracts=tracts)
#lar_constraints is responsible for modifying generated data so that the resulting file passes edits
lar_const = lar_constraints.lar_constraints(counties=counties, tracts=tracts)
#lar_validator checks a dataframe and returns a JSON with edit pass/fail results
lar_validator = rules_engine(lar_schema=lar_schema_df, ts_schema=ts_schema_df,
                             cbsa_data=cbsas)
                             #tracts=tracts, counties=counties, small_counties=small_counties)

#Set parameters for data creation
file_length = data_map["file_length"]["value"] #set number of rows in the test file
lei = data_map["lei"]["value"] #may be None; flags the presence of an LEI
#Only a single LEI should be used for a file, so if one is present, it will be used.
first = True #flag for the first row of data; the first row is used to create the dataframe, subsequent rows are appended
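#A self-contained sketch of the first-row/append pattern that the `first`
#flag above supports. make_row here is a hypothetical stub standing in for
#the lar_gen row-generation call, which is outside this snippet, and the
#fields it returns are illustrative only.
import pandas as pd

def make_row(lei):
    return {"lei": lei, "record_id": "2"} #illustrative fields, not the FIG schema

first = True
for _ in range(3):
    row = make_row(lei="TESTBANKLEI000000000")
    if first:
        lar_frame = pd.DataFrame(row, index=[0])
        first = False
    else:
        lar_frame = pd.concat([lar_frame, pd.DataFrame(row, index=[0])],
                              ignore_index=True)
print(len(lar_frame)) #3 rows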