def test_fetch_integration(self, send_update, queue_render):
    # send_update and queue_render are mocks injected by @patch
    # decorators (not shown in this excerpt).
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        ),
    )
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()
    now = timezone.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, wf_module_id=wf_module.id, now=now)
        )
    wf_module.refresh_from_db()
    so = wf_module.stored_objects.get(stored_at=wf_module.stored_data_version)
    with minio.temporarily_download(
        minio.StoredObjectsBucket, so.key
    ) as parquet_path:
        table = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
    assert_arrow_table_equals(table, {"A": [1]})
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()
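# A minimal sketch of the async_value helper used above to stub the async
# mocks. This definition is an assumption for illustration -- the real
# helper lives in the shared test utilities -- but any Mock.side_effect
# that returns an awaitable behaves the same way:
def async_value(value):
    async def inner(*args, **kwargs):
        return value

    return inner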
def test_fetch_integration(self, send_update, queue_render):
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        ),
    )
    step = workflow.tabs.first().steps.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()
    now = datetime.datetime.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, step_id=step.id, now=now)
        )
    step.refresh_from_db()
    so = step.stored_objects.get(stored_at=step.stored_data_version)
    with s3.temporarily_download(s3.StoredObjectsBucket, so.key) as parquet_path:
        # fetch results are stored without a schema. Let's hard-code a
        # schema simply so we can test that the table data is the same.
        table = read_parquet_as_arrow(
            parquet_path, [Column("A", ColumnType.Number())]
        )
    assert_arrow_table_equals(table, make_table(make_column("A", [1])))
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()
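# For reference: make_table and make_column are cjwkernel test helpers.
# Data-wise, the expected table asserted above is equivalent to this
# plain pyarrow table (an illustrative sketch, not part of the test):
import pyarrow as pa

_expected_data_sketch = pa.table({"A": [1]})  # one number column named "A"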
def test_fetch_integration(self, send_update, queue_render):
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []},
        source_version_hash="abc123",
    )
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    minio.put_bytes(
        minio.ExternalModulesBucket,
        "mod/abc123/code.py",
        b"import pandas as pd\n"
        b"def fetch(params): return pd.DataFrame({'A': [1]})\n"
        b"def render(table, params): return table",
    )
    cjwstate.modules.init_module_system()
    now = timezone.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, wf_module_id=wf_module.id, now=now)
        )
    wf_module.refresh_from_db()
    so = wf_module.stored_objects.get(stored_at=wf_module.stored_data_version)
    with minio.temporarily_download(so.bucket, so.key) as parquet_path:
        table = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
    assert_arrow_table_equals(table, {"A": [1]})
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()
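# The embedded module code these tests install, shown unescaped for
# readability (identical to the python_code strings above):
#
#     import pandas as pd
#
#     def fetch(params):
#         return pd.DataFrame({"A": [1]})
#
#     def render(table, params):
#         return table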
def test_fetch_integration_tempfiles_are_on_disk(self, create_result):
    # /tmp is RAM; /var/tmp is disk. Assert big files go on disk.
    workflow = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        ),
    )
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    with self.assertLogs(level=logging.INFO):
        cjwstate.modules.init_module_system()
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, wf_module_id=wf_module.id)
        )
    create_result.assert_called()
    # call_args[0] is the patched create_result's positional-argument
    # tuple; its third positional argument is the saved FetchResult.
    saved_result: FetchResult = create_result.call_args[0][2]
    self.assertRegex(str(saved_result.path), r"/var/tmp/")
def test_fetch_integration_tempfiles_are_on_disk(self, create_result):
    # /tmp is RAM; /var/tmp is disk. Assert big files go on disk.
    workflow = Workflow.create_and_init()
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []},
        source_version_hash="abc123",
    )
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    minio.put_bytes(
        minio.ExternalModulesBucket,
        "mod/abc123/code.py",
        b"import pandas as pd\n"
        b"def fetch(params): return pd.DataFrame({'A': [1]})\n"
        b"def render(table, params): return table",
    )
    with self.assertLogs(level=logging.INFO):
        cjwstate.modules.init_module_system()
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, wf_module_id=wf_module.id)
        )
    create_result.assert_called()
    saved_result: FetchResult = create_result.call_args[0][2]
    self.assertRegex(str(saved_result.path), r"/var/tmp/")
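# Context for the /var/tmp assertions above: a minimal sketch (an
# illustration, not the fetcher's actual implementation) of allocating a
# temp file on disk-backed /var/tmp instead of RAM-backed /tmp:
import tempfile

with tempfile.NamedTemporaryFile(dir="/var/tmp") as tf:
    assert tf.name.startswith("/var/tmp/")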