def test_watcher_parses_existing_files():
    """The Watcher should parse files that already exist in the watched directory."""
    raw_data_dir = TemporaryDirectory(dir="/tmp", prefix="raw_data")
    with open(
        join(raw_data_dir.name, f"events_data_{str(random.getrandbits(50))}.json"), "w"
    ) as f:
        f.write(get_mock_json())
        f.flush()
    queue = Queue()
    Watcher(path=raw_data_dir.name, watchdog_queue=queue, rabbitmq_queue=TEST_QUEUE)
    raw_data_dir.cleanup()

def test_extract_data():
    """Test the extraction function against a JSON file.

    NamedTemporaryFile/TemporaryDirectory are used to create a temporary
    environment for the input and output files and to read them back for
    assertions.
    """
    expected = [
        "id,event_type,event_ts\n",
        "foo,created,2020-12-08 20:03:16.759617\n",
        "bar,created,2014-12-08 20:03:16.759617\n",
    ]
    with NamedTemporaryFile() as inputfile:
        inputfile.write(get_mock_json().encode("utf-8"))
        inputfile.flush()
        with TemporaryDirectory(dir="/tmp") as tmpdir:
            outputfile = join(tmpdir, "test")
            extract_data(src_path=inputfile.name, dst_path=outputfile)
            with open(outputfile, "r") as f:
                data = f.readlines()
            assert expected == data

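# Illustration only (not an original fixture): the payload returned by
# get_mock_json() is assumed to be the JSON serialisation of a list shaped
# like the one below, inferred from the expected CSV rows in test_extract_data.
# The name MOCK_EVENTS_EXAMPLE is hypothetical.
MOCK_EVENTS_EXAMPLE = [
    {"id": "foo", "event_type": "created", "event_ts": "2020-12-08 20:03:16.759617"},
    {"id": "bar", "event_type": "created", "event_ts": "2014-12-08 20:03:16.759617"},
]
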
def test_watcher_detects_new_file():
    """The Watcher should detect and parse a file created after it has started."""
    raw_data_dir = TemporaryDirectory(dir="/tmp", prefix="raw_data")
    queue = Queue()
    watcher = Watcher(
        path=raw_data_dir.name, watchdog_queue=queue, rabbitmq_queue=TEST_QUEUE
    )
    watcher_thread = threading.Thread(target=watcher.start, daemon=True)
    watcher_thread.start()
    with open(
        join(raw_data_dir.name, f"events_data_{str(random.getrandbits(50))}.json"), "w"
    ) as f:
        f.write(get_mock_json())
        f.flush()
    sleep(5)  # wait for the watcher to parse the file
    # best-effort stop; the daemon thread is discarded when the test process exits
    watcher_thread._running = False
    raw_data_dir.cleanup()

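# Hypothetical refinement (an assumption about the Watcher's behaviour, not part
# of the original test): if the Watcher puts one item on watchdog_queue per
# detected file, the fixed sleep(5) in test_watcher_detects_new_file could be
# replaced by a blocking check, e.g.:
#
#     item = queue.get(timeout=10)
#     assert item is not None
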
def test_import_sources():
    """Exercise the entire mechanism of extracting the data and loading it
    into the database, then verify that the data exists and assert its content.

    TemporaryDirectories and NamedTemporaryFiles are used to simulate the
    environment. The import_sources function uses a prefix to pick up files
    for loading, and therefore requires that the raw data is stored in the
    following structure:

    .
    └── raw_data directory
        └── subdir (e.g. organization_data)
            ├── foo.csv
            └── bar.csv
    """
    raw_data_dir = TemporaryDirectory(dir="/tmp", prefix="raw_data")
    test_data_dir = join(raw_data_dir.name, "test")
    mkdir(test_data_dir)
    raw_data_file = NamedTemporaryFile(dir=test_data_dir, prefix="test", suffix=".json")
    raw_data_file.write(get_mock_json().encode("utf-8"))
    raw_data_file.flush()
    table_md_yaml = get_mock_table_md_yaml()
    with TemporaryDirectory(dir="/tmp", prefix="table_metadata") as md_dir:
        with NamedTemporaryFile(dir=md_dir) as md:
            md.write(table_md_yaml.encode("utf-8"))
            md.flush()
            import_sources(tables_md_dir=md_dir, raw_data_dir=raw_data_dir.name)
    raw_data_file.close()
    raw_data_dir.cleanup()
    expected = [
        ("foo", "created", datetime(2020, 12, 8, 20, 3, 16, 759617)),
        ("bar", "created", datetime(2014, 12, 8, 20, 3, 16, 759617)),
    ]
    pg_hook = PgHook()
    with pg_hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT * FROM test.test_table_delta;")
            assert cur.fetchall() == expected

def publish_for_consumption():
    producer = Producer(host=RABBIT_MQ_HOST, queue=TEST_QUEUE)
    data = json.loads(get_mock_json())
    for msg in data:
        producer.publish_event(msg=json.dumps(msg))

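# Sketch only (not part of the original suite): one way to verify what
# publish_for_consumption() put on the broker would be to drain TEST_QUEUE
# directly with pika. This assumes pika is installed, RabbitMQ is reachable at
# RABBIT_MQ_HOST, and the queue already exists; _drain_test_queue is a
# hypothetical helper name.
def _drain_test_queue():
    import json

    import pika

    connection = pika.BlockingConnection(pika.ConnectionParameters(host=RABBIT_MQ_HOST))
    channel = connection.channel()
    messages = []
    while True:
        # basic_get returns (None, None, None) once the queue is empty
        method, _properties, body = channel.basic_get(queue=TEST_QUEUE, auto_ack=True)
        if method is None:
            break
        messages.append(json.loads(body))
    connection.close()
    return messages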