Example #1
import json
import logging

import pandas as pd

#project-local imports (lar_generator, rules_engine, lar_data_constraints) and
#the yaml loads that define filepaths, geo_config, bank_config_data, and
#lar_file_config_data are omitted from this excerpt

logging.basicConfig(filename=filepaths["log_filepath"]+filepaths['log_filename'], format='%(asctime)s %(message)s', 
					datefmt='%m/%d/%Y %I:%M:%S %p', filemode=filepaths['log_mode'], level=logging.INFO)

geographic_data = pd.read_csv(geo_config['geographic_data_file'], delimiter='|', header=0,
	names=geo_config['file_columns'], dtype=object) #load the Census geographic data file into a dataframe

#create 5 digit county FIPS codes from 2 digit state and 3 digit county
geographic_data['county_fips'] = geographic_data.apply(lambda x: str(x.state_code) + str(x.county), axis=1)
#create 11 digit Census Tract codes from 5 digit county and 6 digit tract
geographic_data["tract_fips"] = geographic_data.apply(lambda x: str(x.county_fips) + str(x.tracts), axis=1)

with open(geo_config["zip_code_file"], 'r') as f:
	zip_codes = json.load(f)
zip_codes.append("Exempt")

#instantiate lar generator to create random LAR and fixed TS data
lar_gen = lar_generator.lar_gen(lar_schema_file=lar_schema_file, ts_schema_file=ts_schema_file)

#set lar_file_config lei to match bank config data
lar_file_config_data["lei"]["value"] = bank_config_data["lei"]["value"]
#instantiate rules engine to check conformity of synthetic data to FIG schema
rules_engine = rules_engine(config_data=lar_file_config_data, state_codes=geo_config["state_codes"], state_codes_rev=geo_config["state_codes_rev"],
	geographic_data=geographic_data, full_lar_file_check=False)

#instantiate constraints logic to force LAR data to conform to FIG schema
lar_constraints = lar_data_constraints(lar_file_config=lar_file_config_data, geographic_data=geographic_data)

#store original row for diff comparison to see what elements are being changed

ts_row = lar_gen.make_ts_row(bank_file_config=bank_config_data) #create the TS row; only one is needed per file
ts_df = pd.DataFrame(ts_row, index=[0])
rules_engine.load_ts_data(ts_df) #loading ts_row to rules_engine converts it to a dataframe for value checking
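The "store original row" comment above points to a diff step against the row after constraints are applied. A minimal sketch of such a comparison, assuming both rows are plain dicts of field values (the repo's actual diff helper is not shown in this excerpt):

def diff_rows(original_row, modified_row):
    #return {field: (old, new)} for every field the constraints logic changed
    return {field: (original_row[field], modified_row[field])
            for field in original_row
            if original_row[field] != modified_row[field]}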
Example #2

	def __init__(self):

		#Loads the filepath configuration. 
		with open('configurations/test_filepaths.yaml') as f:
			#Uses safe_load instead of load.
			self.filepaths = yaml.safe_load(f)

		#Loads the geographic file configuration.
		with open('configurations/geographic_data.yaml') as f:
			# Uses safe_load instead of load.
			self.geographic = yaml.safe_load(f) 

		#Loads the clean file configuration. 
		with open('configurations/clean_file_config.yaml') as f:
			# Uses safe_load instead of load.
			self.data_map = yaml.safe_load(f)

		#Stores the column names for the file containing geographic crosswalk data. 
		file_cols = self.geographic['file_columns']

		#Sets the logging parameters.
		#Uses a log file name and file writing mode from the
		#test_filepaths yaml.  
		logging.basicConfig(filename=self.filepaths['log_filename'],
							format='%(asctime)s %(message)s',
							datefmt='%m/%d/%Y %I:%M:%S %p',
							filemode=self.filepaths['log_mode'],
							level=logging.INFO)

		#Loads geographic crosswalk data from the filepath named in the
		#geographic_data yaml file. 
		self.crosswalk_data = pd.read_csv(self.geographic['crosswalk_data_file'], 
			delimiter='|', header=None, names=file_cols, dtype=str)

		#Creates county, tract, and small county data from the file containing geographic crosswalk data. 
		self.crosswalk_data['countyFips'] = self.crosswalk_data['stateCode'] + self.crosswalk_data['county']
		self.crosswalk_data["tractFips"] = self.crosswalk_data.countyFips + self.crosswalk_data.tracts
		self.counties = list(self.crosswalk_data.countyFips)
		self.tracts = list(self.crosswalk_data.tractFips)
		self.small_counties = list(self.crosswalk_data.countyFips[self.crosswalk_data.smallCounty=="S"])
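		#smallCounty == "S" appears to flag counties below the small-county
		#population threshold used by some HMDA edits (meaning inferred from usage)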

		#Loads schemas for LAR and TS.
		#Schemas contain valid enumerations, including NA values, for each 
		#field in the dataset. 
		self.lar_schema_df = pd.DataFrame(json.load(open(
			self.filepaths['lar_schema_json'], "r")))
		self.ts_schema_df = pd.DataFrame(json.load(open(
			self.filepaths['ts_schema_json'], "r")))

		#Instantiates the other classes. 
		
		#lar_gen is responsible for generating data according to the values
		# in the schema.
		self.lar_gen = lar_generator.lar_gen(self.lar_schema_df, 
			self.ts_schema_df, counties=self.counties, tracts=self.tracts)

		#lar_constraints is responsible for modifying generated data so that 
		#the resulting file passes syntax and validity edits.
		self.lar_const = lar_constraints.lar_constraints(counties=self.counties, 
			tracts=self.tracts) 

		#lar_validator checks a dataframe and returns a JSON with 
		#edit pass/fail results. 
		self.lar_validator = rules_engine(lar_schema=self.lar_schema_df, 
			ts_schema=self.ts_schema_df, crosswalk_data=self.crosswalk_data)
					#tracts=tracts, counties=counties, small_counties=small_counties) 

		#Stores the number of rows in the test file
		self.file_length = self.data_map["file_length"]["value"] 

		#Stores the LEI for the test file. 
		self.lei = self.data_map["lei"]["value"]
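A minimal usage sketch for the initializer above, assuming the enclosing class is named FileGenerator (the class statement is not shown in this excerpt) and the yaml files exist at the listed paths:

gen = FileGenerator()  #hypothetical class name; this __init__ takes no arguments
print(gen.file_length, gen.lei)  #parameters read from clean_file_config.yaml
print(len(gen.counties), len(gen.tracts), len(gen.small_counties))  #geographic lookups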
Example #3

    def __init__(self,
                 file_paths_config='configurations/test_filepaths.yaml',
                 geo_config_file='configurations/geographic_data.yaml',
                 clean_file_config='configurations/clean_file_config.yaml',
                 edit_report_config='configurations/edit_report_config.yaml'):

        print("initializing file generator")
        #Loads the filepath configuration.
        print("opening config files")
        with open(file_paths_config) as f:
            #Uses safe_load instead of load.
            self.filepaths = yaml.safe_load(f)

        #Loads the geographic file configuration.
        with open(geo_config_file) as f:
            # Uses safe_load instead of load.
            self.geographic = yaml.safe_load(f)

        with open(self.geographic["geographic_data_file"]) as f:
            self.geo_data = yaml.safe_load(f)

        #Loads the clean file configuration.
        with open(clean_file_config) as f:
            # Uses safe_load instead of load.
            self.clean_config = yaml.safe_load(f)

        self.geo_config_file = geo_config_file  #set geo config file as class variable to instantiate other classes

        #Loads the edit report configuration.
        with open(edit_report_config) as f:
            # Uses safe_load instead of load.
            self.edit_report_config = yaml.safe_load(f)
        print("config files loaded")
        #Sets the logging parameters.
        #Uses a log file name and file writing mode from the
        #test_filepaths yaml.
        print("configuring logging in file Generator")
        logging.basicConfig(filename=self.filepaths['log_filename'],
                            format='%(asctime)s %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p',
                            filemode=self.filepaths['log_mode'],
                            level=logging.INFO)
        print("logging configured")
        #Loads geographic data from the filepath named in the
        #geographic_data yaml file.
        print("loading geo data to file generator")
        self.geographic_data = pd.read_csv(
            self.geographic['geographic_data_file'],
            delimiter='|',
            header=None,
            names=self.geographic['file_columns'],
            dtype=object)
        #create 5 digit County Codes from 2 digit state and 3 digit county
        self.geographic_data['county_fips'] = self.geographic_data.apply(
            lambda x: str(x.state_code) + str(x.county), axis=1)
        #create 11 digit Census Tract codes from 5 digit county and 6 digit tract
        self.geographic_data["tract_fips"] = self.geographic_data.apply(
            lambda x: str(x.county_fips) + str(x.tracts), axis=1)
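        #note: dtype=object keeps every column as strings, so the same codes
        #could be built with vectorized concatenation instead of apply, as in
        #Example #2 above:
        #  self.geographic_data['county_fips'] = self.geographic_data.state_code + self.geographic_data.county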
        self.small_counties = list(self.geographic_data.county_fips[
            self.geographic_data.small_county == "S"])
        print("geo data loaded to file generator")
        #Loads schemas for LAR and TS.
        #Schemas contain valid enumerations, including NA values, for each field in the dataset.
        print("loading JSON schema files")
        self.lar_schema_df = pd.DataFrame(
            json.load(open(self.filepaths['lar_schema_json'], "r")))
        self.ts_schema_df = pd.DataFrame(
            json.load(open(self.filepaths['ts_schema_json'], "r")))

        #Instantiates the other classes.

        #lar_gen is responsible for generating data according to the values in the schema.
        print("instantiating class objects")
        print("lar gen loading")
        self.lar_gen = lar_generator.lar_gen()
        print("lar gen done")
        #lar_constraints is responsible for modifying generated data so that
        #the resulting file passes syntax and validity edits.
        print("lar constraints loading")
        self.lar_const = lar_constraints.lar_constraints()
        print("lar constraints loaded")
        self.constraints = []
        for func in dir(self.lar_const):
            if func[:1] in ("s", "v") and func[1:4].isdigit():
                self.constraints.append(func)
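        #e.g. methods named like "s300_1" or "v610_2" are collected: the
        #leading letter marks a syntax (s) or validity (v) edit and the next
        #three characters are the edit number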
        #lar_validator checks a dataframe and returns a JSON with
        #edit pass/fail results.
        print("rules engine loading")
        self.lar_validator = rules_engine(geo_config_file=self.geo_config_file)
        #tracts=tracts, counties=counties, small_counties=small_counties)

        print("rules engine loaded")
        print("file generator initialization complete")
Example #4

import json

import pandas as pd

#NOTE: the start of this example was truncated. The read_csv call below is a
#reconstruction with assumed column names and an illustrative path so the
#snippet runs end to end; project-local imports (lar_generator,
#lar_constraints, rules_engine) and the data_map config load are omitted.
cbsa_cols = ["stateCode", "county", "tracts", "smallCounty"]  #assumed column names
cbsas = pd.read_csv("tract_to_cbsa.txt",  #illustrative path
                    delimiter='|',
                    header=None,
                    names=cbsa_cols,
                    dtype=str)  #load tract to CBSA data from platform file
cbsas["countyFips"] = cbsas.stateCode + cbsas.county  #5 digit county FIPS, as in Example #2
cbsas["tractFips"] = cbsas.countyFips + cbsas.tracts
counties = list(cbsas.countyFips)
tracts = list(cbsas.tractFips)
small_counties = list(cbsas.countyFips[cbsas.smallCounty == "1"])
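#note: this file flags small counties with "1", while the crosswalk file in
#Example #2 uses "S" for the same purpose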

#load schemas for LAR and transmittal sheet
#schemas contain valid enumerations, including NA values, for each field in the dataset
lar_schema_df = pd.DataFrame(json.load(open("../schemas/lar_schema.json",
                                            "r")))
ts_schema_df = pd.DataFrame(json.load(open("../schemas/ts_schema.json", "r")))

#instantiate class objects
lar_gen = lar_generator.lar_gen(
    lar_schema_df, ts_schema_df, counties=counties, tracts=tracts
)  #lar gen is responsible for generating data according to the schema
lar_const = lar_constraints.lar_constraints(
    counties=counties, tracts=tracts
)  #lar constraints is responsible for modifying generated data so that the resulting file passes edits
lar_validator = rules_engine(lar_schema=lar_schema_df,
                             ts_schema=ts_schema_df,
                             cbsa_data=cbsas)
#tracts=tracts, counties=counties, small_counties=small_counties)
#lar validator checks a dataframe and returns a JSON with edit pass/fail results

#Set parameters for data creation
file_length = data_map["file_length"][
    "value"]  #set number of rows in test file
lei = data_map["lei"][
    "value"]  #None #Flag for presence of an LEI. Only a single LEI should be used for a file, so if one is present, it will be used.
first = True  #flag for first row of data. The first row is used to create the dataframe, subsequent rows are appended
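The first flag suggests the generation loop (not shown in this excerpt) builds the LAR dataframe row by row. A minimal sketch of that pattern, assuming lar_gen exposes a make_row() method that returns a dict of field values (the method name and signature are assumptions):

for i in range(file_length):
    row = lar_gen.make_row(lei=lei)  #assumed generator API
    if first:
        lar_frame = pd.DataFrame(row, index=[0])  #first row creates the dataframe
        first = False
    else:
        new_row = pd.DataFrame(row, index=[0])
        lar_frame = pd.concat([lar_frame, new_row], axis=0, ignore_index=True)  #subsequent rows appended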