def test_add_to_import_history_second_file(db_transact):
    """Importing a second, distinct file assigns the next id (2) and both
    rows appear in the import history table with the expected columns.

    Fix: the temp files were never closed, leaking file descriptors and
    raising ResourceWarning; `with` guarantees cleanup even on test failure.
    """
    helper_create_import_history_table(db_transact)
    with NamedTemporaryFile(suffix=".csv") as f, \
            NamedTemporaryFile(suffix=".csv") as f2:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1

        f2.write(b"Hello, World")
        # Rewind the file head pointer to the beginning, to simulate reading
        # the file fresh.
        f2.seek(0)
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f2.name),
            db_connection=db_transact,
            filetype="test2"
        )
        assert new_id == 2

        import_history = pd.read_sql(
            sql=f"""
            SELECT * from {dbu.IMPORT_HISTORY_SCHEMA}.{dbu.IMPORT_HISTORY_TABLE}
            """,
            con=db_transact
        )
        npt.assert_array_equal(
            import_history.columns,
            import_history_cols
        )
        npt.assert_array_equal(
            import_history,
            pd.DataFrame([
                [
                    1,  # id
                    datetime(1970, 1, 2, 3, 4, 5),
                    'test1',  # filetype
                    pathlib.Path(f.name).name,  # filename
                    # d41d8cd98f00b204e9800998ecf8427e = the md5 of an empty
                    # file
                    'd41d8cd98f00b204e9800998ecf8427e'  # filehash
                ],
                [
                    2,  # id
                    datetime(1970, 1, 2, 3, 4, 20),  # add 15 sec for tick()
                    'test2',  # filetype
                    pathlib.Path(f2.name).name,  # filename
                    '82bb413746aee42f89dea2b59614f9ef'  # filehash= b"Hello, World"
                ],
            ])
        )
def test_add_to_import_history_simple(db_transact):
    """A first import gets id 1 and writes one fully-populated history row.

    Fix: the temp file was never closed (ResourceWarning / fd leak); the
    `with` block guarantees cleanup even if an assertion fails.
    """
    helper_create_import_history_table(db_transact)
    with NamedTemporaryFile(suffix=".csv") as f:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        import_history = pd.read_sql(
            sql=f"""
            SELECT * from {dbu.IMPORT_HISTORY_SCHEMA}.{dbu.IMPORT_HISTORY_TABLE}
            """,
            con=db_transact
        )
        assert new_id == 1
        npt.assert_array_equal(
            import_history.columns,
            import_history_cols
        )
        npt.assert_array_equal(
            import_history,
            pd.DataFrame([
                [
                    1,  # id
                    datetime(1970, 1, 2, 3, 4, 5),
                    'test1',  # filetype
                    pathlib.Path(f.name).name,  # filename
                    # d41d8cd98f00b204e9800998ecf8427e = the md5 of an empty
                    # file
                    'd41d8cd98f00b204e9800998ecf8427e'  # filehash
                ],
            ])
        )
def test_add_to_import_history_reimport_same_file(db_transact):
    """Re-importing a byte-identical file violates the unique filehash
    constraint and raises.

    Fix: the temp file was never closed (ResourceWarning / fd leak); the
    `with` block guarantees cleanup.

    NOTE(review): `pytest.raises(Exception)` is broad, but the concrete DB
    integrity-error class is driver-specific and not imported in this module;
    the message assertion below pins the actual failure cause.
    """
    helper_create_import_history_table(db_transact)
    with NamedTemporaryFile(suffix=".csv") as f:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        with pytest.raises(Exception) as excinfo:
            new_id = dbu.add_to_import_history(
                filepath=pathlib.Path(f.name),
                db_connection=db_transact,
                filetype="test1"
            )
        assert "Key (filehash)=(d41d8cd98f00b204e9800998ecf8427e) already exists" \
            in str(excinfo.value)
def test_check_if_file_imported_reimport_altered_file(db_transact):
    """A file whose contents changed gets a new history row, and
    check_if_file_imported reports the id of the latest (altered) import.

    Fix: the temp file was never closed (ResourceWarning / fd leak); the
    `with` block guarantees cleanup.
    """
    helper_create_import_history_table(db_transact)
    with NamedTemporaryFile(suffix=".csv") as f:
        filepath = pathlib.Path(f.name)
        new_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1

        # Alter the file so its hash differs, then rewind so it reads fresh.
        f.write(b"Hello, World")
        f.seek(0)
        new_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 2

        is_imported = dbu.check_if_file_imported(filepath, db_transact)
        assert is_imported == 2
def test_check_if_file_imported_simple(db_transact):
    """check_if_file_imported returns the import id of an already-imported
    file.

    Fix: the temp file was never closed (ResourceWarning / fd leak); the
    `with` block guarantees cleanup.
    """
    helper_create_import_history_table(db_transact)
    with NamedTemporaryFile(suffix=".csv") as f:
        filepath = pathlib.Path(f.name)
        new_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        is_imported = dbu.check_if_file_imported(filepath, db_transact)
        assert is_imported == 1
def test_check_if_file_imported_second_file_same_hash_not(db_transact):
    """A different (never-imported) file is reported as not imported (None),
    even though its content hash matches an already-imported empty file.

    Fix: the temp files were never closed (ResourceWarning / fd leak); the
    `with` block guarantees cleanup.
    """
    helper_create_import_history_table(db_transact)
    with NamedTemporaryFile(suffix=".csv") as f, \
            NamedTemporaryFile(suffix=".csv") as f2:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1

        filepath2 = pathlib.Path(f2.name)
        is_imported = dbu.check_if_file_imported(filepath2, db_transact)
        assert is_imported is None
def import_single_file(
    filepath, db_engine, data_files_path=pathlib.PurePosixPath('/')
):
    """Orchestrate reading and import a file.

    Steps: skip if already imported; determine the file's type/parser/table;
    parse the file into a DataFrame; then record it in import history and
    write the rows to the DB inside one transaction.

    Args:
        filepath: pathlib path to the file to import.
        db_engine: DB engine exposing ``begin()`` transaction contexts
            (presumably a SQLAlchemy Engine — TODO confirm).
        data_files_path: base directory used only to shorten paths in log
            messages via ``relative_to``.

    Returns:
        The new import-history id, or None when the file was already imported.
    """
    # Already-imported files are skipped; check_if_file_imported returns a
    # truthy id when a matching record exists (None otherwise, per the tests
    # elsewhere in this file).
    if dbu.check_if_file_imported(filepath, db_engine):
        logging.info(
            f"Already imported: {filepath.relative_to(data_files_path)}"
        )
        return None
    logging.info(f"Importing: {filepath.relative_to(data_files_path)}")
    logging.debug(f"Absolute path: {filepath}")
    # file_info supplies 'tablename', 'filetype', and a 'parser' callable
    # taking (filepath, column_names).
    file_info = _determine_file_type(filepath=filepath)
    schemaname = 'rawdata'
    # Short-lived transaction just to read the destination table's column
    # names, which are passed to the parser.
    with db_engine.begin() as db_con:
        columns_info = dbu.get_db_column_info(
            db_connection=db_con,
            tablename=file_info['tablename'],
            schemaname=schemaname
        )
    columns_name_list = list(columns_info['column_name'])
    df = file_info['parser'](filepath, columns_name_list)
    # Using the context manager allows the adding to import history and writing
    # to DB to be in the same transaction, and it will rollback if it fails.
    with db_engine.begin() as db_con:
        import_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_con,
            filetype=file_info['filetype']
        )
        # Tag every row with the history record it came from.
        df['import_history_id'] = import_id
        dbu.write_df_to_db(
            df=df,
            db_connection=db_con,
            tablename=file_info['tablename'],
            schemaname=schemaname
        )
    return import_id