def test_parquet_same_data_different_bytes(self):
    # Two Parquet files holding identical values but different physical
    # encodings (plain vs. dictionary) must still compare as equal.
    cjwparquet.write(self.old_path, make_table(make_column("A", ["a"])))
    cjwparquet.write(
        self.new_path, make_table(make_column("A", ["a"], dictionary=True))
    )
    old_result = FetchResult(self.old_path)
    new_result = FetchResult(self.new_path)
    self.assertTrue(are_fetch_results_equal(old_result, new_result))
def test_simple(self):
    # The kernel's FetchResult should pass straight through, and every
    # keyword argument the kernel receives should match our inputs.
    self.kernel.fetch.return_value = FetchResult(self.output_path)
    module_zipfile = create_module_zipfile(
        "mod",
        spec_kwargs={"parameters": [{"id_name": "A", "type": "string"}]},
    )
    with self.assertLogs("fetcher.fetch", level=logging.INFO):
        result = fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            "mod",
            module_zipfile,
            {"A": "B"},
            {"C": "D"},
            None,
            None,
            self.output_path,
        )
    self.assertEqual(result, FetchResult(self.output_path, []))
    # Inspect the keyword arguments the kernel was invoked with.
    kwargs = self.kernel.fetch.call_args[1]
    self.assertEqual(
        kwargs["compiled_module"],
        module_zipfile.compile_code_without_executing(),
    )
    self.assertEqual(kwargs["params"], Params({"A": "B"}))
    self.assertEqual(kwargs["secrets"], {"C": "D"})
    self.assertIsNone(kwargs["last_fetch_result"])
    self.assertIsNone(kwargs["input_parquet_filename"])
def test_pass_last_fetch_result(self, downloaded_file):
    # Prepare a previous fetch result plus a fresh result for the kernel
    # to return.
    last_result_path = self.ctx.enter_context(
        tempfile_context(prefix="last-result")
    )
    result_path = self.ctx.enter_context(tempfile_context(prefix="result"))
    self.kernel.fetch.return_value = FetchResult(result_path, [])
    with self.assertLogs("fetcher.fetch", level=logging.INFO):
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            "mod",
            create_module_zipfile("mod"),
            {},
            {},
            FetchResult(last_result_path, []),
            None,
            self.output_path,
        )
    # The previous result must be forwarded to the kernel untouched.
    kwargs = self.kernel.fetch.call_args[1]
    self.assertEqual(
        kwargs["last_fetch_result"], FetchResult(last_result_path, [])
    )
def test_simple(self, load_module):
    # Alias the mocked module for readability below.
    mock_module = load_module.return_value
    mock_module.migrate_params.return_value = {"A": "B"}
    mock_module.fetch.return_value = FetchResult(self.output_path, [])
    result = fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(params={"A": "input"}, secrets={"C": "wrong"}),
        MockModuleVersion(
            id_name="A", param_schema=ParamDType.Dict({"A": ParamDType.String()})
        ),
        {"C": "D"},
        None,
        None,
        self.output_path,
    )
    self.assertEqual(result, FetchResult(self.output_path, []))
    # Params must be migrated from the stored values, then passed to fetch
    # alongside the caller-provided secrets.
    mock_module.migrate_params.assert_called_with({"A": "input"})
    mock_module.fetch.assert_called_with(
        chroot_context=self.chroot_context,
        basedir=self.basedir,
        params=Params({"A": "B"}),
        secrets={"C": "D"},
        last_fetch_result=None,
        input_parquet_filename=None,
        output_filename=self.output_path.name,
    )
def test_different_errors(self):
    # Same file path, but the error lists differ -> results are unequal.
    result1 = FetchResult(
        self.old_path, [RenderError(I18nMessage("foo", {}, None))]
    )
    result2 = FetchResult(
        self.old_path, [RenderError(I18nMessage("bar", {}, None))]
    )
    self.assertFalse(are_fetch_results_equal(result1, result2))
def test_bytes_different(self):
    # Raw (non-Parquet) payloads that differ byte-for-byte are unequal.
    self.old_path.write_bytes(b"slakdjhgt34kj5hlekretjhse3lk4j5ho234kj5rthsadf")
    self.new_path.write_bytes(b"salkdfhgbo324iu5q34rlkiuw3e47ytedasdfgaksjhg3r")
    old_result = FetchResult(self.old_path)
    new_result = FetchResult(self.new_path)
    self.assertFalse(are_fetch_results_equal(old_result, new_result))
def test_parquet_same_data_different_bytes(self):
    # Plain-encoded and dictionary-encoded columns hold the same values,
    # so the two files must compare as equal despite differing bytes.
    plain = arrow_table({"A": ["a"]}).table
    encoded = arrow_table(
        {"A": pyarrow.array(["a"]).dictionary_encode()}
    ).table
    parquet.write(self.old_path, plain)
    parquet.write(self.new_path, encoded)
    self.assertTrue(
        are_fetch_results_equal(
            FetchResult(self.old_path), FetchResult(self.new_path)
        )
    )
async def do_download(
    sheet_id: str, sheet_mime_type: str, oauth2_client: oauth2.Client, output_path: Path
) -> FetchResult:
    """
    Download spreadsheet from Google.

    If `sheet_mime_type` is 'application/vnd.google-apps.spreadsheet', use
    GDrive API to _export_ a text/csv. Otherwise, use GDrive API to _download_
    the file.

    Arguments:
        sheet_id: GDrive file ID to export or download.
        sheet_mime_type: MIME type the client recorded for the file.
        oauth2_client: client used to sign the request with the user's token.
        output_path: where the downloaded bytes are written.

    Returns a FetchResult pointing at `output_path`. HTTP failures are
    reported as errors on the FetchResult, never raised.
    """
    if sheet_mime_type == "application/vnd.google-apps.spreadsheet":
        # Native Google Sheets must be _exported_; the export endpoint
        # produces CSV, so override the MIME type accordingly.
        url = _generate_google_sheet_url(sheet_id)
        sheet_mime_type = "text/csv"
    else:
        url = _generate_gdrive_file_url(sheet_id)
        # and use the passed sheet_mime_type
    # Sign the request: add_token may rewrite the URL and/or add headers.
    url, headers, _ = oauth2_client.add_token(url, headers={})
    try:
        await httpfile.download(url, output_path, headers=headers, ssl=SSL_CONTEXT)
    except HttpError.NotSuccess as err:
        # Server responded, but with a non-success status code. Translate
        # the common GDrive statuses into user-actionable messages.
        response = err.response
        if response.status_code == 401:
            # Expired/revoked token.
            return TODO_i18n_fetch_error(
                output_path, "Invalid credentials. Please reconnect to Google Drive."
            )
        elif response.status_code == 403:
            # Token is valid but lacks permission for this file.
            return TODO_i18n_fetch_error(
                output_path,
                "You chose a file your logged-in user cannot access. Please reconnect to Google Drive or choose a different file.",
            )
        elif response.status_code == 404:
            return TODO_i18n_fetch_error(
                output_path, "File not found. Please choose a different file."
            )
        else:
            # HACK: *err.i18n_message because i18n_message is a tuple
            # compatible with I18nMessage() ctor
            return FetchResult(
                output_path, errors=[RenderError(I18nMessage(*err.i18n_message))]
            )
    except HttpError as err:
        # Transport-level failure (DNS, TLS, timeout, ...).
        # HACK: *err.i18n_message because i18n_message is a tuple
        # compatible with I18nMessage() ctor
        return FetchResult(
            output_path, errors=[RenderError(I18nMessage(*err.i18n_message))]
        )
    return FetchResult(output_path)
def test_fetch_get_stored_dataframe_happy_path(self):
    async def fetch(params, *, get_stored_dataframe):
        # The stored Parquet file must round-trip to the same DataFrame.
        stored = await get_stored_dataframe()
        assert_frame_equal(stored, pd.DataFrame({"A": [1]}))

    with parquet_file({"A": [1]}, dir=self.basedir) as parquet_path:
        self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))
def test_fetch_truncate(self):
    def fetch(params):
        # Three rows; the kernel is expected to truncate down to two.
        return pd.DataFrame({"A": [1, 2, 3]})

    with tempfile_context(dir=self.basedir) as outfile:
        result = self._test_fetch(fetch, output_filename=outfile.name)
        truncation_error = FetchError(
            I18nMessage(
                "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                {"old_number": 3, "new_number": 2},
                None,
            )
        )
        self.assertEqual(result, FetchResult(outfile, errors=[truncation_error]))
        # The written Parquet must contain only the surviving rows.
        assert_arrow_table_equals(
            read_parquet_as_arrow(
                outfile, [Column("A", ColumnType.Number("{:,}"))]
            ),
            make_table(make_column("A", [1, 2])),
        )
def test_render_xlsx_bad_content(self):
    with tempfile_context("fetch-") as http_path:
        # Store a non-xlsx payload behind an xlsx content-type header.
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", XLSX_MIME_TYPE)],
            io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8")),
        )
        result = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(http_path),
            self.output_path,
        )
        # The parse failure surfaces as a render error, not an exception.
        expected_error = RenderError(
            I18nMessage.TODO_i18n(
                'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
            )
        )
        self.assertEqual(result, RenderResult(ArrowTable(), [expected_error]))
def test_render_deprecated_parquet(self):
    # A deprecated raw-Parquet fetch result becomes the output table.
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        result = render_arrow(
            ArrowTable(), P(), "tab-x", FetchResult(fetched_path), self.output_path
        )
        assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
        self.assertEqual(result.errors, [])
def _test_fetch(
    self,
    fetch_fn,
    *,
    params=None,
    secrets=None,
    last_fetch_result=None,
    input_table_parquet_path=None,
    output_filename=None,
):
    """Run the module's Thrift fetch entry point with `fetch_fn` patched in.

    Arguments:
        fetch_fn: callable patched in as `module.fetch`.
        params: fetch params dict (default empty).
        secrets: secrets dict (default empty).
        last_fetch_result: optional previous FetchResult to expose.
        input_table_parquet_path: optional upstream-table Parquet path.
        output_filename: filename (relative to basedir) to write to; a
            throwaway tempfile is used when omitted.

    Returns the FetchResult decoded from the Thrift response.
    """
    # Mutable defaults would be shared across calls; normalize `None`
    # sentinels to fresh empty dicts instead.
    if params is None:
        params = {}
    if secrets is None:
        secrets = {}
    with ExitStack() as ctx:
        ctx.enter_context(patch.object(module, "fetch", fetch_fn))
        if output_filename is None:
            # Make a temporary output filename -- this will make `fetch()`
            # complete, but callers won't be able to see the data it
            # outputs because we'll delete the file too soon.
            output_filename = ctx.enter_context(
                tempfile_context(dir=self.basedir)
            ).name
        thrift_result = module.fetch_thrift(
            ttypes.FetchRequest(
                basedir=str(self.basedir),
                params=Params(params).to_thrift(),
                secrets=RawParams(secrets).to_thrift(),
                last_fetch_result=(
                    last_fetch_result.to_thrift()
                    if last_fetch_result is not None
                    else None
                ),
                input_table_parquet_filename=(
                    input_table_parquet_path.name
                    if input_table_parquet_path is not None
                    else None
                ),
                output_filename=output_filename,
            )
        )
        return FetchResult.from_thrift(thrift_result, self.basedir)
def test_input_crr_corrupt_cache_error_is_none(
    self, downloaded_parquet_file, load_module
):
    mock_module = load_module.return_value
    mock_module.migrate_params.return_value = {}
    mock_module.fetch.return_value = FetchResult(self.output_path, [])
    # Simulate the render cache disappearing mid-download.
    downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
        "file not found"
    )
    input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        input_crr,
        self.output_path,
    )
    # fetch is still called, with `None` as argument.
    self.assertIsNone(
        mock_module.fetch.call_args[1]["input_parquet_filename"]
    )
def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
    mock_module = load_module.return_value
    mock_module.migrate_params.return_value = {}
    mock_module.fetch.return_value = FetchResult(self.output_path, [])
    clean_value.return_value = {}
    downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
    input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        input_crr,
        self.output_path,
    )
    # Passed file is downloaded from rendercache
    downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
    self.assertEqual(
        mock_module.fetch.call_args[1]["input_parquet_filename"],
        "x.parquet",
    )
    # clean_value() is called with input metadata from CachedRenderResult
    clean_value.assert_called()
    self.assertEqual(clean_value.call_args[0][2], input_metadata)
def test_fetch_get_stored_dataframe_empty_file_is_empty_table(self):
    async def fetch(params, *, get_stored_dataframe):
        # A zero-byte stored file is presented as an empty DataFrame.
        stored = await get_stored_dataframe()
        assert_frame_equal(stored, pd.DataFrame())

    with tempfile_context(dir=self.basedir) as parquet_path:
        self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))
def test_render_fetch_error(self):
    fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
    with tempfile_context() as empty_path:
        fetch_result = FetchResult(empty_path, fetch_errors)
        with self.render(P(), fetch_result) as result:
            # Fetch-time errors pass straight through to the render result.
            assert_arrow_table_equals(result.table, ArrowTable())
            self.assertEqual(result.errors, fetch_errors)
def _stored_object_to_fetch_result(
    ctx: contextlib.ExitStack,
    stored_object: Optional[StoredObject],
    wf_module_fetch_error: str,
    dir: Path,
) -> Optional[FetchResult]:
    """
    Given a StoredObject (or None), return a FetchResult (or None).

    This cannot error. Any errors lead to a `None` return value.
    """
    # No stored object selected -> nothing to fetch.
    if stored_object is None:
        return None
    try:
        last_fetch_path = ctx.enter_context(
            storedobjects.downloaded_file(stored_object, dir=dir)
        )
    except FileNotFoundError:
        # The stored object's backing file is gone; treat as "no result".
        return None
    # Wrap a recorded fetch error, if any, as the result's error list.
    errors = (
        [RenderError(I18nMessage.TODO_i18n(wf_module_fetch_error))]
        if wf_module_fetch_error
        else []
    )
    return FetchResult(last_fetch_path, errors)
def test_render_empty_file_fetch_result_is_parquet(self):
    def render(*args, fetch_result):
        # Echo the fetched DataFrame so we can inspect what render() saw.
        return fetch_result.dataframe

    with tempfile_context(dir=self.basedir) as tf:
        result = self._test_render(render, fetch_result=FetchResult(tf))
        assert_render_result_equals(result, RenderResult(arrow_table({})))
def test_render_deprecated_parquet(self):
    # A deprecated raw-Parquet fetch result is loaded as the output table.
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        with self.render(P(), FetchResult(fetched_path)) as result:
            assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
            self.assertEqual(result.errors, [])
def test_render_with_parquet_fetch_result(self):
    def render(*args, fetch_result):
        # Return the fetch result unchanged; it should become the output.
        return fetch_result

    with parquet_file({"A": ["fetched"]}, dir=self.basedir) as pf:
        result = self._test_render(render, fetch_result=FetchResult(pf))
        assert_render_result_equals(
            result, RenderResult(arrow_table({"A": ["fetched"]}))
        )
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result,
    input_table_parquet_path,
    output_path: Path,
) -> FetchResult:
    """Fetch the user-chosen Google Drive file into `output_path`.

    All failures (missing file choice, missing/broken credentials, HTTP
    errors) are reported as errors on the returned FetchResult.
    """

    def please_choose_file() -> FetchResult:
        # Shared "no usable file selected" error result.
        return FetchResult(
            output_path,
            errors=[RenderError(I18nMessage.TODO_i18n("Please choose a file"))],
        )

    file_meta = params["file"]
    if not file_meta:
        return please_choose_file()
    # Ignore file_meta['url']. That's for the client's web browser, not for
    # an API request.
    sheet_id = file_meta["id"]
    if not sheet_id:
        # [adamhooper, 2019-12-06] has this ever happened?
        return please_choose_file()
    # backwards-compat for old entries without 'mimeType', 2018-06-13
    sheet_mime_type = file_meta.get(
        "mimeType", "application/vnd.google-apps.spreadsheet"
    )
    credential = secrets.get("google_credentials")
    if not credential:
        return TODO_i18n_fetch_error(output_path, "Please connect to Google Drive.")
    if "error" in credential:
        return FetchResult(
            output_path,
            errors=[RenderError(I18nMessage.from_dict(credential["error"]))],
        )
    assert "secret" in credential
    oauth2_client = oauth2.Client(
        client_id=None,  # unneeded
        token_type=credential["secret"]["token_type"],
        access_token=credential["secret"]["access_token"],
    )
    return asyncio.run(
        do_download(sheet_id, sheet_mime_type, oauth2_client, output_path)
    )
def test_render_deprecated_parquet_warning(self):
    errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        with self.render(P(), FetchResult(fetched_path, errors)) as result:
            # Table data and fetch-time warnings both survive the render.
            assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
            self.assertEqual(result.errors, errors)
def test_render_with_non_parquet_fetch_result(self):
    def render(*args, fetch_result):
        # Surface the raw fetched bytes as a one-cell table.
        return pd.DataFrame({"A": [fetch_result.path.read_text()]})

    with tempfile_context(dir=self.basedir) as tf:
        tf.write_bytes(b"abcd")
        result = self._test_render(render, fetch_result=FetchResult(tf))
        assert_render_result_equals(
            result, RenderResult(arrow_table({"A": ["abcd"]}))
        )
def test_storage_limits(self, limit):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow.id, wf_module, FetchResult(parquet_path), timezone.now()
            )
        )
    # Saving a result must trigger per-module storage-limit enforcement.
    limit.assert_called_with(wf_module)
def test_render_empty_file_fetch_result_is_parquet(self):
    def render(table, params, *, fetch_result):
        # A zero-byte fetch result must parse as an empty DataFrame.
        assert_frame_equal(fetch_result.dataframe, pd.DataFrame({}))
        return fetch_result.dataframe

    with ModuleTestEnv(render=render) as env:
        with tempfile_context(dir=env.basedir) as tf:
            outcome = env.call_render(
                make_table(), {}, fetch_result=FetchResult(tf)
            )
            self.assertEqual(outcome.read_table(), make_table())
def test_fetch_get_stored_dataframe_unhandled_parquet_is_error(self):
    # Why an error? So module authors can handle it. They _created_ the
    # problem, after all. Let's help them detect it.
    async def fetch(params, *, get_stored_dataframe):
        with self.assertRaises(pa.ArrowIOError):
            await get_stored_dataframe()

    with tempfile_context(dir=self.basedir) as parquet_path:
        parquet_path.write_bytes(b"12345")  # not valid Parquet
        self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))
def _load_fetch_result(
    wf_module: WfModule, basedir: Path, exit_stack: contextlib.ExitStack
) -> Optional[FetchResult]:
    """
    Download user-selected StoredObject to `basedir`, so render() can read it.

    Arguments:
        wf_module: step whose selected StoredObject we load.
        basedir: directory the downloaded tempfile is created in.
        exit_stack: takes ownership of the downloaded file on success, so it
            outlives this function and is deleted when the caller's stack
            closes.

    Edge cases:

    Create no file (and return `None`) if the user did not select a
    StoredObject, or if the selected StoredObject does not point to a file
    on minio. The caller should ensure "leave `path` alone" means "return an
    empty FetchResult".

    The FetchResult may still have an error.
    """
    try:
        # The "selected" version is the StoredObject whose stored_at matches
        # wf_module.stored_data_version.
        stored_object = wf_module.stored_objects.get(
            stored_at=wf_module.stored_data_version
        )
    except StoredObject.DoesNotExist:
        return None
    if not stored_object.bucket or not stored_object.key:
        # Record exists but points nowhere on minio.
        return None
    with contextlib.ExitStack() as inner_stack:
        path = inner_stack.enter_context(
            tempfile_context(prefix="fetch-result-", dir=basedir)
        )
        try:
            minio.download(stored_object.bucket, stored_object.key, path)
            # Download succeeded, so we no longer want to delete `path`
            # right _now_ ("now" means, "in inner_stack.close()"). Instead,
            # transfer ownership of `path` to exit_stack.
            exit_stack.callback(inner_stack.pop_all().close)
        except FileNotFoundError:
            # A few StoredObjects -- very old ones with size=0 -- are
            # *intentionally* not in minio. It turns out modules from that era
            # treated empty-file and None as identical. The _modules_ must
            # preserve that logic for backwards compatibility; so it's safe to
            # return `None` here.
            #
            # Other than that, if the file doesn't exist it's a race: either
            # the fetch result is too _new_ (it's in the database but its file
            # hasn't been written yet) or the fetch result is half-deleted (its
            # file was deleted and it's still in the database). In either case,
            # pretend the fetch result does not exist in the database -- i.e.,
            # return `None`.
            return None
    # Attach the step's recorded fetch error (if any) to the result.
    if wf_module.fetch_error:
        errors = [RenderError(I18nMessage.TODO_i18n(wf_module.fetch_error))]
    else:
        errors = []
    return FetchResult(path, errors)
def test_race_hard_deleted_wf_module(self):
    workflow = Workflow.create_and_init()
    wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
    # Hard-delete the step out from under the save, simulating a race.
    WfModule.objects.filter(id=wf_module.id).delete()
    # Don't crash
    with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
        self.run_with_async_db(
            save.create_result(
                workflow.id, wf_module, FetchResult(parquet_path), timezone.now()
            )
        )
def fetch_arrow(
    params: Dict[str, Any],
    secrets,
    last_fetch_result,
    input_table_parquet_path,
    output_path: Path,
) -> FetchResult:
    """Download `params["url"]` to `output_path`.

    HTTP failures are converted into errors on the returned FetchResult
    rather than raised.
    """
    url: str = params["url"].strip()
    # Advertise every MIME type we know how to parse.
    accepted = ",".join(v.value for v in AllowedMimeTypes)
    try:
        asyncio.run(
            httpfile.download(url, output_path, headers=[("Accept", accepted)])
        )
    except HttpError as err:
        # HACK: *err.i18n_message because i18n_message is a tuple
        # compatible with I18nMessage() ctor
        return FetchResult(
            output_path, errors=[RenderError(I18nMessage(*err.i18n_message))]
        )
    return FetchResult(output_path)