def test_fetch_get_stored_dataframe_happy_path(self):
    async def fetch(params, *, get_stored_dataframe):
        df = await get_stored_dataframe()
        assert_frame_equal(df, pd.DataFrame({"A": [1]}))

    with parquet_file({"A": [1]}, dir=self.basedir) as parquet_path:
        self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))


def test_fetch_get_input_dataframe_happy_path(self):
    async def fetch(params, *, get_input_dataframe):
        df = await get_input_dataframe()
        assert_frame_equal(df, pd.DataFrame({"A": [1]}))

    with parquet_file({"A": [1]}, dir=self.basedir) as parquet_path:
        self._test_fetch(fetch, input_table_parquet_path=parquet_path)
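
# The tests in this section lean on a `parquet_file` helper that is not
# shown here. Below is a minimal sketch of what it is assumed to do; the
# exact signature and cleanup behavior are assumptions, not the project's
# actual implementation.
import os
import tempfile
from contextlib import contextmanager
from pathlib import Path

import pyarrow
import pyarrow.parquet


@contextmanager
def parquet_file(table, dir=None):
    """Yield the Path of a temporary Parquet file holding `table`.

    `table` may be a pyarrow.Table or a dict mapping column name to values.
    """
    if not isinstance(table, pyarrow.Table):
        table = pyarrow.table(table)
    fd, filename = tempfile.mkstemp(suffix=".parquet", dir=dir)
    os.close(fd)
    try:
        pyarrow.parquet.write_table(table, filename)
        yield Path(filename)
    finally:
        os.unlink(filename)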
def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress
    with ExitStack() as ctx:
        input_arrow_table = ctx.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        parquet_filename = Path(
            ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
        ).name
        out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
        thrift_result = module.render_thrift(
            ttypes.RenderRequest(
                str(self.basedir),
                input_arrow_table.to_thrift(),
                Params({}).to_thrift(),
                ttypes.Tab("tab-1", "Tab 1"),
                ttypes.FetchResult(
                    parquet_filename,
                    [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
                ),
                out_filename,
            )
        )
        result = RenderResult.from_thrift(thrift_result, self.basedir)
        assert_render_result_equals(
            result,
            RenderResult(
                arrow_table({"A": [2]}),
                [RenderError(I18nMessage.TODO_i18n("A warning"))],
            ),
        )


def test_slice_lots_of_types(self):
    dt1 = datetime(2019, 12, 18, 23, 33, 55, 123000)
    dt2 = datetime(2019, 12, 18)
    with parquet_file(
        {
            "str": ["x", "y", None, ""],
            "cat": pa.array(["x", "y", None, ""]).dictionary_encode(),
            "dt": pa.array([dt1, None, dt2, None], pa.timestamp("ns")),
            "int32": [1, 2, None, 2 ** 31],
            "float": [1.1, None, 3.3, 4.4],
        }
    ) as path:
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(5), range(4)),
            "\n".join(
                [
                    "str,cat,dt,int32,float",
                    "x,x,2019-12-18T23:33:55.123Z,1,1.1",
                    "y,y,,2,",
                    ",,2019-12-18,,3.3",
                    ",,,2147483648,4.4",
                ]
            ),
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(5), range(4)),
            "".join(
                [
                    "[",
                    '{"str":"x","cat":"x","dt":"2019-12-18T23:33:55.123Z","int32":1,"float":1.1},',
                    '{"str":"y","cat":"y","dt":null,"int32":2,"float":null},',
                    '{"str":null,"cat":null,"dt":"2019-12-18","int32":null,"float":3.3},',
                    '{"str":"","cat":"","dt":null,"int32":2147483648,"float":4.4}',
                    "]",
                ]
            ),
        )
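
# An aside on the expected strings above (inferred from the fixtures, not
# from read_slice_as_text's documentation): nulls serialize as empty CSV
# cells and as JSON null; timestamps render as ISO 8601 with a trailing
# "Z", shortened to a bare date when the time is exactly midnight; and
# int32-overflowing values such as 2 ** 31 still print exactly.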
def test_fetch_result_happy_path(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
        fetch_error="maybe an error",
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
    wf_module.stored_data_version = so.stored_at
    wf_module.save(update_fields=["stored_data_version"])

    def render(*args, fetch_result, **kwargs):
        self.assertEqual(
            fetch_result.errors,
            [RenderError(I18nMessage.TODO_i18n("maybe an error"))],
        )
        assert_arrow_table_equals(
            pyarrow.parquet.read_table(str(fetch_result.path)), {"A": [1]}
        )
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                Path("/unused"),
            )
        )


def test_render_deprecated_parquet(self):
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        result = render_arrow(
            ArrowTable(), P(), "tab-x", FetchResult(fetched_path), self.output_path
        )
        assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
        self.assertEqual(result.errors, [])


def test_fetch_result_deleted_file_means_none(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
    wf_module.stored_data_version = so.stored_at
    wf_module.save(update_fields=["stored_data_version"])
    # Now delete the file on S3 -- but leave the DB pointing to it.
    minio.remove(so.bucket, so.key)

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    with self._stub_module(render):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )


def test_pydict_zero_row_groups(self):
    table = pyarrow.Table.from_batches(
        [], schema=pyarrow.schema([("A", pyarrow.string())])
    )
    with parquet_file(table) as path:
        self.assertEqual(parquet.read_pydict(path, range(1), range(0)), {"A": []})


def test_render_with_parquet_fetch_result(self):
    def render(*args, fetch_result):
        return fetch_result

    with parquet_file({"A": ["fetched"]}, dir=self.basedir) as pf:
        result = self._test_render(render, fetch_result=FetchResult(pf))
        assert_render_result_equals(
            result, RenderResult(arrow_table({"A": ["fetched"]}))
        )


def test_render_deprecated_parquet(self):
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        with self.render(P(), FetchResult(fetched_path)) as result:
            assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
            self.assertEqual(result.errors, [])


def test_slice_ignore_missing_columns(self):
    with parquet_file({"A": [1]}) as path:
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(3), range(1)), "A\n1"
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(3), range(1)),
            '[{"A":1}]',
        )


def test_render_deprecated_parquet_warning(self):
    errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        with self.render(P(), FetchResult(fetched_path, errors)) as result:
            assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
            self.assertEqual(result.errors, errors)


def test_load_selected_stored_object(self):
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(
        order=0, slug="step-1", module_id_name="foodeleted"
    )
    with parquet_file({"A": [1]}) as path1:
        storedobjects.create_stored_object(workflow.id, step.id, path1)
    with parquet_file({"A": [2]}) as path2:
        so2 = storedobjects.create_stored_object(workflow.id, step.id, path2)
    with parquet_file({"A": [3]}) as path3:
        storedobjects.create_stored_object(workflow.id, step.id, path3)
    step.stored_data_version = so2.stored_at
    step.save(update_fields=["stored_data_version"])
    result = self.run_with_async_db(
        fetch.load_database_objects(workflow.id, step.id)
    )
    self.assertEqual(result[3], so2)
    self.assertEqual(result.stored_object, so2)
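
# `create_stored_object` is not shown in this section. It is assumed to
# upload the local Parquet file to the stored-objects bucket and return a
# StoredObject row whose `stored_at` timestamp doubles as the version key,
# which is why tests select a version by assigning
# `stored_data_version = so.stored_at`.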
def test_storage_limits(self, limit):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow.id, wf_module, FetchResult(parquet_path), timezone.now()
            )
        )
    limit.assert_called_with(wf_module)


def test_slice_zero_row_groups(self):
    table = pa.Table.from_batches([], schema=pa.schema([("A", pa.string())]))
    with parquet_file(table) as path:
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(1), range(0)), "A"
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(1), range(0)), "[]"
        )
def test_pydict_nan(self):
    with parquet_file(
        {"A": pyarrow.array([1.1, float("nan"), None], type=pyarrow.float64())}
    ) as path:
        result = parquet.read_pydict(path, range(1), range(3))
        self.assertEqual(result["A"][0], 1.1)
        self.assertTrue(math.isnan(result["A"][1]))
        self.assertTrue(math.isnan(result["A"][2]))
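
# Hedged aside: the assertion that the null at index 2 reads back as NaN
# matches pandas-style conversion, where Arrow nulls in float columns
# become NaN. A standalone demonstration with plain pyarrow, independent
# of the project's read_pydict:
#
#     >>> import pyarrow
#     >>> pyarrow.array([1.1, float("nan"), None]).to_pandas().tolist()
#     [1.1, nan, nan]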
def test_fetch_result_happy_path(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
        fetch_errors=[
            RenderError(I18nMessage("foo", {}, "module")),
            RenderError(I18nMessage("bar", {"x": "y"}, "cjwmodule")),
        ],
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, step.id, path)
    step.stored_data_version = so.stored_at
    step.save(update_fields=["stored_data_version"])
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        python_code=textwrap.dedent(
            """
            import pyarrow.parquet
            import pandas as pd
            from pandas.testing import assert_frame_equal
            from cjwkernel.types import RenderError, I18nMessage

            def render(table, params, *, fetch_result, **kwargs):
                assert fetch_result.errors == [
                    RenderError(I18nMessage("foo", {}, "module")),
                    RenderError(I18nMessage("bar", {"x": "y"}, "cjwmodule")),
                ]
                # read_table() returns a pyarrow.Table, so convert to
                # pandas before assert_frame_equal.
                fetch_dataframe = pyarrow.parquet.read_table(
                    str(fetch_result.path)
                ).to_pandas()
                assert_frame_equal(fetch_dataframe, pd.DataFrame({"A": [1]}))
                return pd.DataFrame()
            """
        ),
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                chroot_context=self.chroot_context,
                workflow=workflow,
                step=step,
                module_zipfile=module_zipfile,
                params={},
                tab_name=tab.name,
                input_path=self.empty_table_path,
                input_table_columns=[],
                tab_results={},
                output_path=self.output_path,
            )
        )
def test_race_hard_deleted_wf_module(self):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
    WfModule.objects.filter(id=wf_module.id).delete()
    # Don't crash
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow.id, wf_module, FetchResult(parquet_path), timezone.now()
            )
        )


def test_render_with_parquet_fetch_result(self):
    def render(table, params, *, fetch_result):
        return fetch_result

    with ModuleTestEnv(render=render) as env:
        with parquet_file({"A": ["fetched"]}, dir=env.basedir) as pf:
            outcome = env.call_render(make_table(), {}, fetch_result=FetchResult(pf))
            assert_arrow_table_equals(
                outcome.read_table(), make_table(make_column("A", ["fetched"]))
            )


def test_slice_rows(self):
    with parquet_file({"A": [0, 1, 2, 3, 4, 5, 6, 7]}) as path:
        self.assertEqual(
            parquet.read_slice_as_text(path, "csv", range(1), range(2, 5)),
            "A\n2\n3\n4",
        )
        self.assertEqual(
            parquet.read_slice_as_text(path, "json", range(1), range(2, 5)),
            '[{"A":2},{"A":3},{"A":4}]',
        )


def test_render_deprecated_parquet_warning(self):
    errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        result = render_arrow(
            ArrowTable(),
            P(),
            "tab-x",
            FetchResult(fetched_path, errors=errors),
            self.output_path,
        )
        assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
        self.assertEqual(result.errors, errors)


def test_storage_limits(self, limit):
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(order=0, slug="step-1")
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow.id,
                step,
                FetchResult(parquet_path),
                datetime.datetime.now(),
            )
        )
    limit.assert_called_with(step=step)


def test_race_soft_deleted_wf_module(self):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", is_deleted=True
    )
    workflow_id = workflow.id
    workflow.delete()
    # Don't crash
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow_id, wf_module, FetchResult(parquet_path), timezone.now()
            )
        )
    self.assertEqual(wf_module.stored_objects.count(), 0)


def test_race_hard_deleted_step(self):
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(order=0, slug="step-1")
    Step.objects.filter(id=step.id).delete()
    # Don't crash
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow.id,
                step,
                FetchResult(parquet_path),
                datetime.datetime.now(),
            )
        )


def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress
    #
    # TODO nix this functionality.
    with ModuleTestEnv() as env:
        with parquet_file({"A": [2]}, dir=env.basedir) as parquet_path:
            outcome = env.call_render(
                make_table(),
                {},
                fetch_result=FetchResult(
                    path=parquet_path, errors=[FetchError(TODO_i18n("A warning"))]
                ),
            )
            self.assertEqual(
                outcome.result, RenderResult([RenderError(TODO_i18n("A warning"))])
            )
            assert_arrow_table_equals(
                outcome.read_table(), make_table(make_column("A", [2]))
            )


def test_fetch_result_deleted_file_means_none(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, step.id, path)
    step.stored_data_version = so.stored_at
    step.save(update_fields=["stored_data_version"])
    # Now delete the file on S3 -- but leave the DB pointing to it.
    s3.remove(s3.StoredObjectsBucket, so.key)
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        python_code=textwrap.dedent(
            """
            import pandas as pd

            def render(table, params, *, fetch_result, **kwargs):
                assert fetch_result is None
                return pd.DataFrame()
            """
        ),
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                chroot_context=self.chroot_context,
                workflow=workflow,
                step=step,
                module_zipfile=module_zipfile,
                params={},
                tab_name=tab.name,
                input_path=self.empty_table_path,
                input_table_columns=[],
                tab_results={},
                output_path=self.output_path,
            )
        )


def test_fetch_result_deleted_file_means_none(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    with parquet_file({"A": [1]}) as path:
        so = create_stored_object(workflow.id, wf_module.id, path)
    wf_module.stored_data_version = so.stored_at
    wf_module.save(update_fields=["stored_data_version"])
    # Now delete the file on S3 -- but leave the DB pointing to it.
    minio.remove(minio.StoredObjectsBucket, so.key)

    def render(*args, fetch_result, **kwargs):
        self.assertIsNone(fetch_result)
        return RenderResult()

    module_zipfile = create_module_zipfile(
        "x",
        python_code=textwrap.dedent(
            """
            import pandas as pd

            def render(table, params, *, fetch_result, **kwargs):
                assert fetch_result is None
                return pd.DataFrame()
            """
        ),
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_wfmodule(
                self.chroot_context,
                workflow,
                wf_module,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )


def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress
    with ExitStack() as ctx:
        input_arrow_table = ctx.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        parquet_filename = Path(
            ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
        ).name
        out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
        thrift_result = module.render_thrift(
            ttypes.RenderRequest(
                str(self.basedir),
                arrow_arrow_table_to_thrift(input_arrow_table),
                {},  # params
                ttypes.Tab("tab-1", "Tab 1"),
                ttypes.FetchResult(
                    parquet_filename,
                    [
                        ttypes.RenderError(
                            ttypes.I18nMessage(
                                "TODO_i18n",
                                {
                                    "text": ttypes.I18nArgument(
                                        string_value="A warning"
                                    )
                                },
                            ),
                            [],
                        )
                    ],
                ),
                out_filename,
            )
        )
        result = thrift_render_result_to_arrow(thrift_result, self.basedir)
        assert_render_result_equals(
            result,
            RenderResult(
                arrow_table({"A": [2]}),
                [RenderError(I18nMessage.TODO_i18n("A warning"))],
            ),
        )


def test_create_result(self, send_update):
    send_update.side_effect = async_noop
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0,
        slug="step-1",
        is_busy=True,
        fetch_errors=[RenderError(I18nMessage("foo", {}, "module"))],
    )
    now = timezone.datetime(2019, 10, 22, 12, 22, tzinfo=timezone.utc)
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(workflow.id, wf_module, FetchResult(parquet_path), now)
        )
    self.assertEqual(wf_module.stored_objects.count(), 1)
    self.assertEqual(wf_module.fetch_errors, [])
    self.assertEqual(wf_module.is_busy, False)
    self.assertEqual(wf_module.last_update_check, now)
    wf_module.refresh_from_db()
    self.assertEqual(wf_module.fetch_errors, [])
    self.assertEqual(wf_module.is_busy, False)
    self.assertEqual(wf_module.last_update_check, now)
    send_update.assert_called_with(
        workflow.id,
        clientside.Update(
            steps={
                wf_module.id: clientside.StepUpdate(is_busy=False, last_fetched_at=now)
            }
        ),
    )
    workflow.refresh_from_db()
    self.assertIsInstance(workflow.last_delta, ChangeDataVersionCommand)


def test_pydict_lots_of_types(self):
    dt1 = datetime.now()
    dt2 = datetime.now()
    with parquet_file(
        {
            "str": ["x", "y", None, "z"],
            "cat": pyarrow.array(["x", "y", None, "x"]).dictionary_encode(),
            "dt": [dt1, None, dt2, None],
            "int32": [1, 2, 3, 2 ** 31],
            "float": [1.1, 2.2, 3.3, 4.4],
        }
    ) as path:
        self.assertEqual(
            parquet.read_pydict(path, range(5), range(4)),
            {
                "str": ["x", "y", None, "z"],
                "cat": ["x", "y", None, "x"],
                "dt": [dt1, None, dt2, None],
                "int32": [1, 2, 3, 2 ** 31],
                "float": [1.1, 2.2, 3.3, 4.4],
            },
        )
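
# A hedged sanity check of the round-trip pattern read_pydict is tested
# against, using only public pyarrow APIs. This is independent of the
# project's parquet module; the helper name is illustrative.
import tempfile

import pyarrow
import pyarrow.parquet


def roundtrip_pydict(columns):
    """Write `columns` to a temporary Parquet file and read them back."""
    table = pyarrow.table(columns)
    with tempfile.NamedTemporaryFile(suffix=".parquet") as tf:
        pyarrow.parquet.write_table(table, tf.name)
        return pyarrow.parquet.read_table(tf.name).to_pydict()


# e.g. roundtrip_pydict({"A": [1, 2, None]}) == {"A": [1, 2, None]}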