def test_file_checking(self):
    '''
    Various quality assurance checks on the DataReader's validation functions
    - include_flag in manifest used properly to determine loading or skipping
    - CSV file being loaded has the expected number of fields as found in meta.json
    - convenience function should_file_be_loaded performs properly
    '''
    META_FILENAME = os.path.join(os.path.dirname(__file__),
                                 'test_data/meta_sample.json')
    meta = load_meta_data(META_FILENAME)

    MANIFEST_FILENAME = os.path.join(os.path.dirname(__file__),
                                     'test_data/manifest_sample.csv')
    manifest = ManifestReader(MANIFEST_FILENAME)

    for manifest_row in manifest:
        csv_reader = DataReader(meta=meta, manifest_row=manifest_row)

        # Use unittest's assertion rather than a bare `assert`, which is
        # silently stripped when Python runs with -O.
        self.assertTrue(csv_reader.do_fields_match())

        # Assume the sql manifest row exactly matches the csv manifest row
        # for the purpose of this test.
        sql_manifest_row = manifest_row

        if manifest_row['include_flag'] == "skip":
            self.assertFalse(csv_reader.check_include_flag(sql_manifest_row))
            self.assertFalse(csv_reader.should_file_be_loaded(sql_manifest_row))
        elif manifest_row['include_flag'] == "use":
            self.assertTrue(csv_reader.check_include_flag(sql_manifest_row))
            self.assertTrue(csv_reader.should_file_be_loaded(sql_manifest_row))
def test_file_checking(self):
    '''
    Various quality assurance checks on the DataReader's validation functions
    - include_flag in manifest used properly to determine loading or skipping
    - CSV file being loaded has the expected number of fields as found in meta.json
    - convenience function should_file_be_loaded performs properly
    '''
    META_FILENAME = os.path.join(os.path.dirname(__file__),
                                 'test_data/meta_sample.json')
    meta = load_meta_data(META_FILENAME)

    MANIFEST_FILENAME = os.path.join(os.path.dirname(__file__),
                                     'test_data/manifest_sample.csv')
    manifest = ManifestReader(MANIFEST_FILENAME)

    for manifest_row in manifest:
        csv_reader = DataReader(meta=meta, manifest_row=manifest_row)

        # Use unittest's assertion rather than a bare `assert`, which is
        # silently stripped when Python runs with -O.
        # Assume the fields exactly match for the purpose of this test,
        # except as modified below.
        self.assertTrue(csv_reader._do_fields_match())

        # sql_manifest_row has an extra field to indicate whether it has been
        # successfully loaded in previous runs or not. Copy instead of
        # aliasing so that setting 'status' below does not mutate the
        # manifest_row we are iterating over.
        sql_manifest_row = dict(manifest_row)
        sql_manifest_row['status'] = 'failed'

        if manifest_row['include_flag'] == "skip":
            self.assertFalse(csv_reader._check_include_flag(sql_manifest_row))
            self.assertFalse(csv_reader.should_file_be_loaded(sql_manifest_row))
        elif manifest_row['include_flag'] == "use":
            self.assertTrue(csv_reader._check_include_flag(sql_manifest_row))
            self.assertTrue(csv_reader.should_file_be_loaded(sql_manifest_row))

        # If the file is already in the SQL manifest, skip it when manifest
        # says 'use' (i.e. it's already loaded)
        sql_manifest_row['status'] = 'loaded'
        if manifest_row['include_flag'] == "use":
            self.assertFalse(csv_reader._check_include_flag(sql_manifest_row))
def make_all_json(manifest_path):
    """Generate a draft json entry for every usable table in the manifest.

    Walks the manifest rows and, for each destination table flagged 'use'
    that has not been handled yet, builds the absolute path to its source
    file and hands it to make_draft_json. Each table is processed at most
    once; rows whose table was already generated are reported and skipped.
    """
    completed_tables = {}
    manifest = ManifestReader(path=manifest_path)
    if manifest.has_unique_ids():
        for row in manifest:
            table = row['destination_table']
            if table in completed_tables:
                # Table already generated from an earlier manifest row.
                print("skipping {}, already loaded".format(
                    row['unique_data_id']))
                continue
            if row['include_flag'] != 'use':
                continue
            source_path = os.path.abspath(
                os.path.join(row['local_folder'], row['filepath']))
            make_draft_json(source_path, table)
            completed_tables[table] = source_path
            print("completed {}".format(table))
        print("Finished! Used these files to generate json:")
        print(completed_tables)
def test_unique_manifest_ids(self):
    '''Make sure the 'unique_data_id' column in the manifest is unique to each row'''
    test_data_dir = os.path.dirname(__file__)

    # Example with all unique values
    all_unique = ManifestReader(
        os.path.join(test_data_dir, 'test_data/manifest_unique.csv'))
    self.assertTrue(all_unique.has_unique_ids())

    # Example with a duplicated name
    has_duplicates = ManifestReader(
        os.path.join(test_data_dir, 'test_data/manifest_duplicates.csv'))
    self.assertFalse(has_duplicates.has_unique_ids())
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
     - Use manifest.csv to find out all the files we want to load
     - Compare current manifest.csv to the sql database manifest table, which
       reflects manifest.csv as of the last time this script was run, and
       tells us whether or not the file has already been loaded in this
       database.
     - Use meta.json to make sure the CSV file has all the fields we expect
       it to, and to decide the data type of each field.
     - Load the csv into the database

    If there is an error loading a file (either flagged error from fields
    not matching, or parsing error if data type isn't right):
     - skip loading this file,
     - report the error to SQL using update_sql_manifest(status="error")
     - use logging.warning() to write the specific error encountered to the
       log file
     - at the end of loading print an error message to the console
    """
    # load given meta.json and manifest.csv files into memory
    meta = ingestionfunctions.load_meta_data(meta_path)
    manifest = ManifestReader(manifest_path)

    # load given database choice and check/create sql manifest
    engine = dbtools.get_database_engine(database_choice)
    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    # TODO should this be moved into the __init__ of ManifestReader? Do we
    # ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    # Iterate through each row in the manifest then clean and validate. If it
    # passes validation, then write to temp psv file for loading into database.
    for manifest_row in manifest:
        # Incompletely filled out rows in the manifest can break the other code
        # TODO: figure out a way to flag this issue early in loading of manifest

        # only clean and validate data files flagged for use in database
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))

            # prepare csv reader and writer objects
            csv_reader = DataReader(meta=meta, manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta, manifest_row=manifest_row,
                                   filename=temp_filepath)

            # prepare objects for interfacing with database and then get
            # the equivalent manifest row from the database
            sql_interface = HISql(meta=meta, manifest_row=manifest_row,
                                  engine=engine, filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            # Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # Identify fields that exist in meta.json but not CSV
            # so we can add them to the row as it is cleaned and loaded.
            meta_only_fields = {}
            for field in meta[tablename]['fields']:
                if field['source_name'] not in csv_reader.keys:
                    # adds 'sql_name',None as key,value pairs in dict
                    meta_only_fields[field['sql_name']] = None

            # clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(
                    sql_manifest_row=sql_manifest_row):
                print(" Cleaning...")
                for idx, data_row in enumerate(csv_reader):
                    data_row.update(meta_only_fields)  # insert other field dict
                    clean_data_row = cleaner.clean(data_row, idx)
                    # Identity check is the idiomatic None test (PEP 8);
                    # rows the cleaner rejects are dropped.
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)
                csv_writer.close()

                print(" Loading...")
                # Decide whether to append or replace the table
                if meta[tablename]["replace_table"]:
                    logging.info(" replacing existing table")
                    sql_interface.drop_table()

                # Appends to table; if dropped, it recreates
                sql_interface.create_table_if_necessary()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    # TODO tell user total count of errors.
                    # currently write_file_to_sql() just writes in log that
                    # file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()
# SETUP - Mostly taken from load_data.py this_file = os.path.realpath(__file__) this_dir = os.path.dirname(this_file) meta_path = os.path.abspath( this_dir + '/test_data/meta_sample.json') # TODO Update when meta.json is updated. manifest_path = os.path.abspath( this_dir + '/test_data/manifest_sample.csv') # TODO Consider updating manifest_sample database_choice = 'docker_database' logging_path = os.path.abspath("../logs") meta = ingestionfunctions.load_meta_data(meta_path) engine = dbtools.get_database_engine(database_choice) manifest = ManifestReader(manifest_path) FAKE_REQUIRED_FIELD = { 'sql_name': 'foo', 'type': 'decimal', 'source_name': 'FAKE_REQUIRED', 'display_text': 'Blah', 'display_name': 'x', 'required_in_source': True } FAKE_OPTIONAL_FIELD = { 'sql_name': 'foo', 'type': 'decimal', 'source_name': 'FAKE_OPTIONAL', 'display_text': 'Blah', 'display_name': 'x',
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
     - Use manifest.csv to find out all the files we want to load
     - Compare current manifest.csv to the sql database manifest table, which
       reflects manifest.csv as of the last time this script was run, and
       tells us whether or not the file has already been loaded in this
       database.
     - Use meta.json to make sure the CSV file has all the fields we expect
       it to, and to decide the data type of each field
     - Load the csv into the database

    If there is an error loading a file (either flagged error from fields
    not matching, or parsing error if data type isn't right):
     - skip loading this file,
     - report the error to SQL using update_sql_manifest(status="error")
     - use logging.warning() to write the specific error encountered to the
       log file
     - at the end of loading print an error message to the console
    """
    meta = ingestionfunctions.load_meta_data(meta_path)
    engine = dbtools.get_database_engine(database_choice)
    manifest = ManifestReader(manifest_path)

    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    # TODO should this be moved into the __init__ of ManifestReader? Do we
    # ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    for manifest_row in manifest:
        # Incompletely filled out rows in the manifest can break the other code
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))

            csv_reader = DataReader(meta=meta, manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta, manifest_row=manifest_row,
                                   filename=temp_filepath)
            sql_interface = HISql(meta=meta, manifest_row=manifest_row,
                                  engine=engine, filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            # Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(sql_manifest_row=sql_manifest_row):
                print(" Ready to clean {}".format(csv_reader.destination_table))
                for idx, data_row in enumerate(csv_reader):
                    clean_data_row = cleaner.clean(data_row, idx)
                    # Identity check is the idiomatic None test (PEP 8);
                    # rows the cleaner rejects are dropped.
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)
                csv_writer.close()

                print(" Ready to load")
                # Decide whether to append or replace the table
                if meta[tablename]["replace_table"]:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()

                # Appends to table; if dropped, it recreates
                sql_interface.create_table()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    # TODO tell user total count of errors.
                    # currently write_file_to_sql() just writes in log that
                    # file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()