def test_default_render_returns_fetch_result(self):
    # Functionality used by libraryofcongress: a module's render_thrift()
    # is given a FetchResult (Parquet file + warning) and the test expects
    # the result to carry that file's table and that same warning.
    with ExitStack() as stack:
        input_table = stack.enter_context(
            arrow_table_context({"A": [1]}, dir=self.basedir)
        )
        fetched_file = stack.enter_context(
            parquet_file({"A": [2]}, dir=self.basedir)
        )
        # The request carries bare file names alongside str(self.basedir).
        fetched_basename = Path(fetched_file.name).name
        output_basename = stack.enter_context(
            tempfile_context(dir=self.basedir)
        ).name

        request = ttypes.RenderRequest(
            str(self.basedir),
            input_table.to_thrift(),
            Params({}).to_thrift(),
            ttypes.Tab("tab-1", "Tab 1"),
            ttypes.FetchResult(
                fetched_basename,
                [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
            ),
            output_basename,
        )
        actual = RenderResult.from_thrift(
            module.render_thrift(request), self.basedir
        )

        expected = RenderResult(
            arrow_table({"A": [2]}),
            [RenderError(I18nMessage.TODO_i18n("A warning"))],
        )
        assert_render_result_equals(actual, expected)
def test_quick_fixes(self):
    # Two WrongColumnType errors (text->number and datetime->number) should
    # produce one quick fix each, plus one paragraph each in the error text.
    text_columns = PromptingError.WrongColumnType(
        ["A"], "text", frozenset({"number"})
    )
    datetime_columns = PromptingError.WrongColumnType(
        ["B", "C"], "datetime", frozenset({"number"})
    )
    err = PromptingError([text_columns, datetime_columns])

    actual_quick_fixes = err.as_quick_fixes()
    expected_quick_fixes = [
        QuickFix(
            I18nMessage.TODO_i18n("Convert Text to Numbers"),
            QuickFixAction.PrependStep("converttexttonumber", {"colnames": ["A"]}),
        ),
        QuickFix(
            I18nMessage.TODO_i18n("Convert Dates & Times to Numbers"),
            QuickFixAction.PrependStep(
                "converttexttonumber", {"colnames": ["B", "C"]}
            ),
        ),
    ]
    self.assertEqual(actual_quick_fixes, expected_quick_fixes)

    actual_message = err.as_error_str()
    expected_message = (
        "The column “A” must be converted from Text to Numbers.\n\n"
        "The columns “B” and “C” must be converted from Dates & Times to Numbers."
    )
    self.assertEqual(actual_message, expected_message)
def test_render_xlsx_bad_content(self):
    # The stored HTTP response claims the XLSX MIME type but its body is
    # plain text; render is expected to report a read error and no table.
    not_really_xlsx = "ceçi n'est pas une .xlsx".encode("utf-8")
    expected = RenderResult(
        ArrowTable(),
        [
            RenderError(
                I18nMessage.TODO_i18n(
                    'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
                )
            )
        ],
    )

    with tempfile_context("fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", XLSX_MIME_TYPE)],
            io.BytesIO(not_really_xlsx),
        )
        fetch_result = FetchResult(http_path)
        actual = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            fetch_result,
            self.output_path,
        )
        self.assertEqual(actual, expected)
def parse_csv(
    path: Path,
    *,
    output_path: Path,
    encoding: Optional[str],
    delimiter: Optional[str],
    has_header: bool,
    autoconvert_text_to_numbers: bool,
) -> RenderResult:
    """
    Parse the CSV-like file at `path` and write it to `output_path` as Arrow.

    Parsing is delegated to `_parse_csv()`. The parsed table is always written
    to `output_path`; the returned `RenderResult` wraps it, except when the
    table has no columns, in which case it holds an empty `ArrowTable`.

    All parser warnings are merged, one per line, into a single `RenderError`.
    """
    parsed = _parse_csv(
        path,
        encoding=encoding,
        delimiter=delimiter,
        has_header=has_header,
        autoconvert_text_to_numbers=autoconvert_text_to_numbers,
    )
    table = parsed.table

    with pyarrow.ipc.RecordBatchFileWriter(
        output_path.as_posix(), schema=table.schema
    ) as writer:
        writer.write_table(table)

    metadata = infer_table_metadata(table)
    if len(metadata.columns) > 0:
        arrow_table = ArrowTable(output_path, table, metadata)
    else:
        # Zero columns: return the empty table rather than the file-backed one.
        arrow_table = ArrowTable()

    errors = []
    if parsed.warnings:
        # TODO when we support i18n, this will be even simpler....
        en_message = "\n".join(str(warning) for warning in parsed.warnings)
        errors.append(RenderError(I18nMessage.TODO_i18n(en_message)))

    return RenderResult(arrow_table, errors)
def test_report_module_error(self):
    # A stubbed render() raises ModuleExitedError(-9, ""); the step is
    # expected to yield the generic "unexpected" error ending in "SIGKILL".
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )

    def render(*args, fetch_result, **kwargs):
        raise ModuleExitedError(-9, "")

    with self._stub_module(render):
        execution = execute_wfmodule(
            self.chroot_context,
            workflow,
            wf_module,
            {},
            Tab(tab.slug, tab.name),
            RenderResult(),
            {},
            self.output_path,
        )
        actual = self.run_with_async_db(execution)

    expected_error = RenderError(
        I18nMessage.TODO_i18n(
            "Something unexpected happened. We have been notified and are "
            "working to fix it. If this persists, contact us. Error code: "
            "SIGKILL"
        )
    )
    self.assertEqual(actual, RenderResult(errors=[expected_error]))
def _stored_object_to_fetch_result(
    ctx: contextlib.ExitStack,
    stored_object: Optional[StoredObject],
    wf_module_fetch_error: str,
    dir: Path,
) -> Optional[FetchResult]:
    """
    Convert a StoredObject (or None) into a FetchResult (or None).

    The stored file is downloaded into `dir` through a context manager that is
    entered on `ctx`, so `ctx` owns the downloaded file. A non-empty
    `wf_module_fetch_error` becomes the FetchResult's single error.

    Returns `None` if `stored_object` is `None` or if FileNotFoundError is
    raised while building the result.
    """
    if stored_object is None:
        return None

    try:
        downloaded_path = ctx.enter_context(
            storedobjects.downloaded_file(stored_object, dir=dir)
        )
        errors = (
            [RenderError(I18nMessage.TODO_i18n(wf_module_fetch_error))]
            if wf_module_fetch_error
            else []
        )
        return FetchResult(downloaded_path, errors)
    except FileNotFoundError:
        return None
def test_fetch_nothing(self):
    # With no file selected (file=None), fetch should ask the user to pick one.
    expected_errors = [RenderError(I18nMessage.TODO_i18n("Please choose a file"))]
    with tempfile_context(prefix="output-") as output_path:
        fetch_result = fetch_arrow(P(file=None), {}, None, None, output_path)
        self.assertEqual(fetch_result.errors, expected_errors)
def test_execute_migrate_params_module_error_gives_default_params(self):
    # The module's migrate_params() calls an undefined name. The test expects
    # render() to receive the spec's default ("def") instead of the stored
    # value ("good"); render() echoes its params back as an error message.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta1 = workflow.last_delta

    module_source = textwrap.dedent(
        """
        import json
        def render(table, params):
            return "params: " + json.dumps(params)
        def migrate_params(params):
            cause_module_error()  # NameError
        """
    )
    param_spec = {"id_name": "x", "type": "string", "default": "def"}
    create_module_zipfile(
        "mod",
        spec_kwargs={"parameters": [param_spec]},
        python_code=module_source,
    )

    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta1.id,
        module_id_name="mod",
        params={"x": "good"},
    )

    self._execute(workflow)

    wf_module.refresh_from_db()
    expected_errors = [RenderError(I18nMessage.TODO_i18n('params: {"x": "def"}'))]
    self.assertEqual(wf_module.cached_render_result_errors, expected_errors)
def test_deleted_module(self):
    # A step whose module_id_name is "deleted_module" should render a
    # "please delete this step" error, which must also be cached on the step.
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="deleted_module",
        last_relevant_delta_id=workflow.last_delta_id,
    )

    execution = execute_wfmodule(
        self.chroot_context,
        workflow,
        wf_module,
        {},
        tab.to_arrow(),
        RenderResult(),
        {},
        self.output_path,
    )
    actual = self.run_with_async_db(execution)

    uninstalled_error = RenderError(
        I18nMessage.TODO_i18n(
            "Please delete this step: an administrator uninstalled its code."
        )
    )
    expected = RenderResult(errors=[uninstalled_error])
    self.assertEqual(actual, expected)

    wf_module.refresh_from_db()
    self.assertEqual(wf_module.cached_render_result.errors, expected.errors)
def render(*args, fetch_result, **kwargs):
    # Stub render(): check the fetch result it was handed (one error and a
    # Parquet file holding {"A": [1]}), then return an empty RenderResult.
    actual_errors = fetch_result.errors
    self.assertEqual(
        actual_errors, [RenderError(I18nMessage.TODO_i18n("maybe an error"))]
    )
    fetched_table = pyarrow.parquet.read_table(str(fetch_result.path))
    assert_arrow_table_equals(fetched_table, {"A": [1]})
    return RenderResult()
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result,
    input_table_parquet_path,
    output_path: Path,
) -> FetchResult:
    """
    Download the file selected in `params["file"]` to `output_path`.

    Returns a FetchResult holding an error (and no download attempt) when:

    * no file is selected, or the selected file has an empty "id";
    * there is no "google_credentials" secret;
    * the secret carries an "error" entry.

    Otherwise, builds an OAuth2 client from the secret and returns whatever
    `do_download()` produces.
    """

    def choose_a_file() -> FetchResult:
        return FetchResult(
            output_path,
            errors=[RenderError(I18nMessage.TODO_i18n("Please choose a file"))],
        )

    file_meta = params["file"]
    if not file_meta:
        return choose_a_file()

    # Ignore file_meta['url']. That's for the client's web browser, not for
    # an API request.
    sheet_id = file_meta["id"]
    if not sheet_id:
        # [adamhooper, 2019-12-06] has this ever happened?
        return choose_a_file()

    # backwards-compat for old entries without 'mimeType', 2018-06-13
    sheet_mime_type = file_meta.get(
        "mimeType", "application/vnd.google-apps.spreadsheet"
    )

    secret = secrets.get("google_credentials")
    if not secret:
        return TODO_i18n_fetch_error(output_path, "Please connect to Google Drive.")
    if "error" in secret:
        secret_error = RenderError(I18nMessage.from_dict(secret["error"]))
        return FetchResult(output_path, errors=[secret_error])
    assert "secret" in secret

    token = secret["secret"]
    oauth2_client = oauth2.Client(
        client_id=None,  # unneeded
        token_type=token["token_type"],
        access_token=token["access_token"],
    )

    download = do_download(sheet_id, sheet_mime_type, oauth2_client, output_path)
    return asyncio.run(download)
def test_render_deprecated_parquet_warning(self):
    # Errors attached to a Parquet FetchResult should be passed through
    # unchanged, alongside the table read from the file.
    errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    expected_table = {"A": [1, 2], "B": [3, 4]}
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path, self.render(
        P(), FetchResult(fetched_path, errors)
    ) as result:
        assert_arrow_table_equals(result.table, expected_table)
        self.assertEqual(result.errors, errors)
def test_fetch_return_tuple_path_and_error(self):
    # fetch() may return a (path, message) tuple; the message should become
    # the result's only error.
    with tempfile_context(dir=self.basedir) as outfile:

        async def fetch(params):
            outfile.write_text("xyz")
            return outfile, "foo"

        fetch_result = self._test_fetch(fetch, output_filename=outfile.name)
        expected_errors = [RenderError(I18nMessage.TODO_i18n("foo"))]
        self.assertEqual(fetch_result.errors, expected_errors)
def test_render_truncate(self):
    # render() returns three rows; the test expects only two to survive,
    # together with a truncation warning.
    def render(table, params):
        return pd.DataFrame({"A": [1, 2, 3]})

    rendered = self._test_render(render)

    assert_arrow_table_equals(rendered.table, {"A": [1, 2]})
    expected_errors = [
        RenderError(I18nMessage.TODO_i18n("Truncated output from 3 rows to 2"))
    ]
    self.assertEqual(rendered.errors, expected_errors)
def _load_fetch_result(
    wf_module: WfModule, basedir: Path, exit_stack: contextlib.ExitStack
) -> Optional[FetchResult]:
    """
    Download the user-selected StoredObject into `basedir` for render() to read.

    On success the downloaded temporary file is handed over to `exit_stack`,
    which becomes responsible for cleaning it up. The returned FetchResult
    carries `wf_module.fetch_error` (if set) as its single error.

    Returns `None` -- leaving no file behind -- when:

    * no StoredObject matches `wf_module.stored_data_version`;
    * the StoredObject has no bucket or no key;
    * the file is missing from minio (FileNotFoundError during download).
    """
    try:
        stored_object = wf_module.stored_objects.get(
            stored_at=wf_module.stored_data_version
        )
    except StoredObject.DoesNotExist:
        return None

    if not (stored_object.bucket and stored_object.key):
        return None

    with contextlib.ExitStack() as inner_stack:
        path = inner_stack.enter_context(
            tempfile_context(prefix="fetch-result-", dir=basedir)
        )

        try:
            minio.download(stored_object.bucket, stored_object.key, path)
        except FileNotFoundError:
            # A few StoredObjects -- very old ones with size=0 -- are
            # *intentionally* not in minio. Modules from that era treated
            # empty-file and None as identical, and the _modules_ must keep
            # doing so for backwards compatibility; so `None` is safe here.
            #
            # Otherwise, a missing file is a race: either the fetch result is
            # too _new_ (in the database, file not yet written) or it is
            # half-deleted (file gone, database row remaining). Either way,
            # pretend the fetch result does not exist in the database.
            #
            # Leaving the `with` block here closes inner_stack, which still
            # owns `path`.
            return None
        else:
            # Download succeeded, so `path` must not be deleted right _now_
            # ("now" means "in inner_stack.close()"). Instead, transfer
            # ownership of `path` to exit_stack.
            exit_stack.callback(inner_stack.pop_all().close)

    errors = (
        [RenderError(I18nMessage.TODO_i18n(wf_module.fetch_error))]
        if wf_module.fetch_error
        else []
    )
    return FetchResult(path, errors)
def test_duplicate_column_names_renamed(self):
    # Two header cells named "A": the second column should come out as
    # "A 2", with a warning that mentions the rename.
    rendered = render_arrow(P(csv="A,A\na,b", has_header_row=True))

    assert_arrow_table_equals(rendered.table, {"A": ["a"], "A 2": ["b"]})
    rename_warning = RenderError(
        I18nMessage.TODO_i18n("Renamed 1 duplicate column names (see “A 2”)")
    )
    self.assertEqual(rendered.errors, [rename_warning])
def test_fetch_return_error(self):
    # fetch() returning a bare string: the result should point at the
    # (empty) output file and carry the string as its only error.
    async def fetch(params):
        return "bad things"

    with tempfile_context(dir=self.basedir) as outfile:
        fetch_result = self._test_fetch(fetch, output_filename=outfile.name)

        self.assertEqual(fetch_result.path, outfile)
        expected_errors = [RenderError(I18nMessage.TODO_i18n("bad things"))]
        self.assertEqual(fetch_result.errors, expected_errors)
        self.assertEqual(outfile.read_bytes(), b"")
def test_not_found(self):
    # HTTP 404 from the mocked server: empty output file plus a
    # "file not found" error.
    self.mock_http_response = MockHttpResponse(404)
    not_found = RenderError(
        I18nMessage.TODO_i18n("File not found. Please choose a different file.")
    )
    with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
        self.assertEqual(result.path.read_bytes(), b"")
        self.assertEqual(result.errors, [not_found])
async def prepare_secret_oauth1a(
    logic: ParamSpecSecret.Logic.Oauth1a, value: UserProvidedSecret
) -> ModuleSecret:
    """
    Prepare an OAuth1a secret for a module fetch() call.

    SECURITY: beware: we provide the module with our consumer secret. The
    module can masquerade as Workbench. The module will be able to authenticate
    with the provider as the end user, forever.

    Input: a non-`None` UserProvidedSecret has a "secret" sub-dict with keys:

    * `oauth_token`: OAuth 1.0a access token provided by service for user.
    * `oauth_token_secret`: OAuth 1.0a access token secret provided by service
      for user.

    Output: a falsy `value` yields `None`. On success, the result is a copy of
    `value` whose "secret" sub-dict has keys:

    * `consumer_key`: for signing requests.
    * `consumer_secret`: for signing requests.
    * `resource_owner_key`: the user's `oauth_token` ("" if missing).
    * `resource_owner_secret`: the user's `oauth_token_secret` ("" if missing).

    Otherwise, ModuleSecret "error" value will be an I18nMessage-compatible
    dict describing the problem. All problems that may cause an "error":

    * After the user set a valid secret, Workbench was reconfigured and the
      provider was disabled.
    """
    if not value:
        return None

    service = oauth.OAuthService.lookup_or_none(logic.service)
    if not service:
        not_configured = I18nMessage.TODO_i18n(
            "Service %r is no longer configured" % logic.service
        )
        return _secret_error(value, not_configured)

    user_tokens = value.get("secret", {})
    module_tokens = {
        "consumer_key": service.consumer_key,
        "consumer_secret": service.consumer_secret,
        "resource_owner_key": user_tokens.get("oauth_token", ""),
        "resource_owner_secret": user_tokens.get("oauth_token_secret", ""),
    }
    return {**value, "secret": module_tokens}
def test_render_deprecated_parquet_warning(self):
    # Errors stored with a Parquet fetch result should be returned as-is
    # alongside the table read from the file.
    errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        fetch_result = FetchResult(fetched_path, errors=errors)
        rendered = render_arrow(
            ArrowTable(), P(), "tab-x", fetch_result, self.output_path
        )
        assert_arrow_table_equals(rendered.table, {"A": [1, 2], "B": [3, 4]})
        self.assertEqual(rendered.errors, errors)
def test_invalid_auth_error(self):
    # HTTP 401 from the mocked server: empty output file plus an
    # "invalid credentials" error.
    self.mock_http_response = MockHttpResponse(401)
    invalid_credentials = RenderError(
        I18nMessage.TODO_i18n(
            "Invalid credentials. Please reconnect to Google Drive."
        )
    )
    with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
        self.assertEqual(result.path.read_bytes(), b"")
        self.assertEqual(result.errors, [invalid_credentials])
def test_missing_secret_error(self):
    # No secrets at all: empty output file plus a "please connect" error.
    please_connect = RenderError(
        I18nMessage.TODO_i18n("Please connect to Google Drive.")
    )
    with self.fetch(P(), {}) as result:
        self.assertEqual(result.path.read_bytes(), b"")
        self.assertEqual(result.errors, [please_connect])
        # Should not make any request
        self.assertIsNone(self.last_http_requestline)
def user_visible_bug_fetch_result(output_path: Path, message: str) -> FetchResult:
    """
    Build the FetchResult shown to users when something unexpected goes wrong.

    Empties the file at `output_path` and returns a FetchResult pointing at it,
    with one generic error whose text ends with `message` as the error code.
    """
    output_path.write_bytes(b"")  # empty
    text = (
        "Something unexpected happened. We have been notified and are "
        "working to fix it. If this persists, contact us. Error code: "
        + message
    )
    bug_report = RenderError(I18nMessage.TODO_i18n(text))
    return FetchResult(path=output_path, errors=[bug_report])
def test_detect_unknown_file_extension(self):
    # A ".bin" file has no known MIME type: expect an empty table and an
    # error that names the extension.
    unknown_extension = RenderError(
        I18nMessage.TODO_i18n(
            "Unknown file extension '.bin'. Please try a different file."
        )
    )
    with _data_file(b"A,B\nx,y", suffix=".bin") as bin_path:
        parsed = parse_file(bin_path, output_path=self.output_path)
        assert_arrow_table_equals(parsed.table, {})
        self.assertEqual(parsed.errors, [unknown_extension])
def test_missing_secret_error(self):
    # secrets(None): empty output file plus a "please connect" error.
    please_connect = RenderError(
        I18nMessage.TODO_i18n("Please connect to Google Drive.")
    )
    with tempfile_context() as output_path:
        fetch_result = fetch_arrow(P(), secrets(None), None, None, output_path)
        self.assertEqual(fetch_result.path.read_bytes(), b"")
        self.assertEqual(fetch_result.errors, [please_connect])
        # Should not make any request
        self.assertIsNone(self.last_http_requestline)
def test_no_access_error(self):
    # HTTP 403 from the mocked server: empty output file plus a
    # "cannot access" error.
    self.mock_http_response = MockHttpResponse(403)
    no_access = RenderError(
        I18nMessage.TODO_i18n(
            "You chose a file your logged-in user cannot access. "
            "Please reconnect to Google Drive or choose a different file."
        )
    )
    with self.fetch(P(), secrets=secrets(DEFAULT_SECRET)) as result:
        self.assertEqual(result.path.read_bytes(), b"")
        self.assertEqual(result.errors, [no_access])
def test_fetch_http_404(self):
    # HTTP 404 from the mocked server: empty output file plus an error
    # that quotes the HTTP status.
    self.mock_http_response = MockHttpResponse(404, [("Content-Length", 0)])
    url = self.build_url("/not-found")
    http_404 = RenderError(
        I18nMessage.TODO_i18n("Error from server: HTTP 404 Not Found")
    )
    with self.fetch(url) as result:
        self.assertEqual(result.path.read_bytes(), b"")
        self.assertEqual(result.errors, [http_404])
def _wrap_render_errors(render_call):
    """
    Call `render_call()` and return its result.

    If it raises ModuleError, return instead a RenderResult with one generic
    "something unexpected happened" error whose text ends with
    `format_for_user_debugging(err)`. Other exceptions propagate.
    """
    try:
        return render_call()
    except ModuleError as err:
        text = (
            "Something unexpected happened. We have been notified and are "
            "working to fix it. If this persists, contact us. Error code: "
            + format_for_user_debugging(err)
        )
        return RenderResult(errors=[RenderError(I18nMessage.TODO_i18n(text))])
def parse_file(
    path: Path,
    *,
    output_path: Path,
    encoding: Optional[str] = None,
    mime_type: Optional[MimeType] = None,
    has_header: bool = True,
) -> RenderResult:
    """
    Parse the file at `path` into `output_path`, dispatching on MIME type.

    If `mime_type` is None, it is detected from the file name. The full,
    lowercased multi-part extension (all of `path.suffixes` joined) is tried
    first. If that is unknown, the final suffix alone is tried, so a name with
    dots in its stem (e.g. "sales.2019.csv") is still recognized as ".csv".
    If neither is known, the returned RenderResult holds only an "Unknown file
    extension" error, which quotes the full multi-part extension.

    CSV, TSV and TXT go to `parse_csv()` (delimiter ",", "\\t" and
    auto-detected, respectively); JSON to `parse_json()`; XLS and XLSX to
    `parse_xls_file()` / `parse_xlsx_file()`.

    Raises RuntimeError for a MimeType that has no parser here.
    """
    if mime_type is None:
        ext = "".join(path.suffixes).lower()
        # Deduplicate while keeping order: for a single-suffix name both
        # candidates are the same string.
        candidates = list(dict.fromkeys([ext, path.suffix.lower()]))
        for candidate in candidates:
            try:
                mime_type = MimeType.from_extension(candidate)
            except KeyError:
                continue
            break
        else:
            return RenderResult(
                errors=[
                    RenderError(
                        I18nMessage.TODO_i18n(
                            "Unknown file extension %r. Please try a different file."
                            % ext
                        )
                    )
                ]
            )

    if mime_type in {MimeType.CSV, MimeType.TSV, MimeType.TXT}:
        delimiter: Optional[str] = {
            MimeType.CSV: ",",
            MimeType.TSV: "\t",
            MimeType.TXT: None,  # None is passed through to parse_csv()
        }[mime_type]
        return parse_csv(
            path,
            output_path=output_path,
            encoding=encoding,
            delimiter=delimiter,
            has_header=has_header,
            autoconvert_text_to_numbers=True,
        )
    elif mime_type == MimeType.JSON:
        return parse_json(path, output_path=output_path, encoding=encoding)
    elif mime_type == MimeType.XLS:
        return parse_xls_file(
            path,
            output_path=output_path,
            has_header=has_header,
            autoconvert_types=True,
        )
    elif mime_type == MimeType.XLSX:
        return parse_xlsx_file(
            path,
            output_path=output_path,
            has_header=has_header,
            autoconvert_types=True,
        )
    else:
        raise RuntimeError("Unhandled MIME type")
def test_fetch_truncate(self):
    # fetch() returns three rows; the test expects a truncation warning and
    # only two rows in the Parquet output file.
    def fetch(params):
        return pd.DataFrame({"A": [1, 2, 3]})

    truncation_warning = RenderError(
        I18nMessage.TODO_i18n("Truncated output from 3 rows to 2")
    )
    with tempfile_context(dir=self.basedir) as outfile:
        fetch_result = self._test_fetch(fetch, output_filename=outfile.name)
        self.assertEqual(fetch_result.errors, [truncation_warning])

        written_table = pa.parquet.read_table(str(outfile), use_threads=False)
        assert_arrow_table_equals(written_table, {"A": [1, 2]})