def import_zipfile(path: Path) -> clientside.Module:
    """
    Save a zipfile to database and minio and build a `clientside.Module`.

    Raise `WorkbenchModuleImportError` if `path` points to an invalid module.

    Otherwise, do not raise any errors one can sensibly recover from.
    """
    temp_zipfile = ModuleZipfile(path)
    validate_zipfile(temp_zipfile)  # raise WorkbenchModuleImportError
    module_id = temp_zipfile.module_id
    version = temp_zipfile.version
    module_spec = temp_zipfile.get_spec()
    js_module = temp_zipfile.get_optional_js_module() or ""
    minio.fput_file(
        minio.ExternalModulesBucket, "%s/%s" % (module_id, path.name), path
    )
    ModuleVersion.objects.update_or_create(
        id_name=module_id,
        source_version_hash=version,
        # Reuse `module_spec` instead of calling get_spec() a second time
        # (the original re-read and re-parsed the spec here).
        spec=asdict(module_spec),
        js_module=js_module,
    )
    return clientside.Module(module_spec, js_module)
def get_migrated_params(
    wf_module: WfModule, *, module_zipfile: Optional[ModuleZipfile] = None
) -> Dict[str, Any]:
    """
    Read `wf_module.params`, calling migrate_params() or using cache fields.

    Call this within a `Workflow.cooperative_lock()`.

    If migrate_params() was already called for this version of the module,
    return the cached value. See `wf_module.cached_migrated_params`,
    `wf_module.cached_migrated_params_module_version`.

    Raise `ModuleError` if migration fails.

    Raise `KeyError` if the module was deleted.

    Raise `RuntimeError` (unrecoverable) if there is a problem loading or
    executing the module. (Modules are validated before import, so this should
    not happen.)

    The result may be invalid. Call `validate()` to raise a `ValueError` to
    detect that case.

    TODO avoid holding the database lock whilst executing stuff on the kernel.
    (This will involve auditing and modifying all callers to handle new error
    cases.)
    """
    if module_zipfile is None:
        # raise KeyError
        module_zipfile = MODULE_REGISTRY.latest(wf_module.module_id_name)

    stale = (
        # "develop" modules change without a version bump: always re-migrate.
        module_zipfile.version == "develop"
        # works if cached version (and thus cached _result_) is None
        or (
            module_zipfile.get_param_schema_version()
            != wf_module.cached_migrated_params_module_version
        )
    )
    if not stale:
        return wf_module.cached_migrated_params
    else:
        # raise ModuleError
        params = invoke_migrate_params(module_zipfile, wf_module.params)
        # Write the cache fields so the next call can skip migration.
        wf_module.cached_migrated_params = params
        wf_module.cached_migrated_params_module_version = (
            module_zipfile.get_param_schema_version()
        )
        try:
            wf_module.save(
                update_fields=[
                    "cached_migrated_params",
                    "cached_migrated_params_module_version",
                ]
            )
        except ValueError:
            # WfModule was deleted, so we get:
            # "ValueError: Cannot force an update in save() with no primary key."
            pass
        return params
def download_module_zipfile(
    tempdir: Path,
    module_id: ModuleId,
    version: ModuleVersion,
    *,
    deprecated_spec: Dict[str, Any],
    deprecated_js_module: str,
) -> ModuleZipfile:
    """
    Produce a local-path ModuleZipfile by downloading from minio.

    Raise `RuntimeError` (_from_ another kind of error -- `FileNotFoundError`,
    `KeyError`, `ValueError`, `SyntaxError`, `BadZipFile`, `UnicodeDecodeError`
    or more) if the zipfile is not a valid Workbench module. We spend the time
    testing the zipfile for validity because A) it's good to catch errors
    quickly; and B) fetcher, renderer and server all need to execute code on
    each module, so they're destined to validate the module anyway.

    The zipfile is always written to "{tempdir}/{module_id}.{version}.zip".
    This function is not re-entrant when called with the same parameters.
    Callers may use locks to avoid trying to download the same data multiple
    times.
    """
    logger.info("download_module_zipfile(%s.%s.zip)", module_id, version)

    zip_path = tempdir / ("%s.%s.zip" % (module_id, version))
    try:
        _download_module_zipfile_modern(zip_path, module_id, version)
    except FileNotFoundError as modern_error:
        # Fall back to the deprecated storage layout. If that also fails,
        # chain from the *modern* lookup's error.
        try:
            _download_module_zipfile_deprecated(
                zip_path,
                module_id,
                version,
                spec=deprecated_spec,
                js_module=deprecated_js_module,
            )
        except FileNotFoundError:
            raise RuntimeError from modern_error

    result = ModuleZipfile(zip_path)  # raise ZipfileError
    try:
        # raise KeyError or SyntaxError
        compiled_module = result.compile_code_without_executing()
        result.get_spec()  # raise KeyError or ValueError
        cjwstate.modules.kernel.validate(compiled_module)  # raise ModuleError
    except Exception as err:
        raise RuntimeError from err
    return result
def invoke_migrate_params(
    module_zipfile: ModuleZipfile, raw_params: Dict[str, Any]
) -> Dict[str, Any]:
    """Call module `migrate_params()` using (global) kernel.

    Raise ModuleError if module code did not execute.

    The result may not be valid. Call `param_schema.validate(result)` to raise
    a `ValueError` on error; or call `param_schema.coerce(result)` to
    guarantee a valid result.

    Log any ModuleError. Also log success.
    """
    begin = time.time()
    module_name = module_zipfile.path.name
    logger.info("%s:migrate_params() begin", module_name)
    status = "???"
    try:
        compiled = module_zipfile.compile_code_without_executing()
        # raise ModuleError
        result = cjwstate.modules.kernel.migrate_params(compiled, raw_params)
        status = "ok"
        return result
    except ModuleError as err:
        logger.exception("Exception in %s:migrate_params()", module_name)
        status = type(err).__name__
        raise
    finally:
        # Always log duration and outcome, success or failure.
        elapsed_ms = int((time.time() - begin) * 1000)
        logger.info(
            "%s:migrate_params() => %s in %dms", module_name, status, elapsed_ms
        )
def validate_zipfile(module_zipfile: ModuleZipfile) -> None:
    """Ensure `path` points to a valid ModuleZipfile.

    Raise `WorkbenchModuleImportError` with an English-language description
    of the flaw otherwise. (This can help module authors fix their mistakes.)
    """
    try:
        module_zipfile.get_spec()  # raise KeyError, ValueError, BadZipFile
        # raise KeyError, UnicodeDecodeError, SyntaxError, BadZipFile
        compiled_module = module_zipfile.compile_code_without_executing()
        cjwstate.modules.kernel.validate(compiled_module)  # raise ModuleError
        module_zipfile.get_optional_html()  # raise UnicodeError, BadZipFile
        module_zipfile.get_optional_js_module()  # raise UnicodeError, BadZipFile
    except zipfile.BadZipFile as err:
        raise WorkbenchModuleImportError("Bad zipfile: %s" % str(err)) from err
    except UnicodeError as err:
        # BUGFIX: this handler must come before `except ValueError`.
        # UnicodeError is a subclass of ValueError, so with the old ordering a
        # UnicodeDecodeError from the Python/HTML/JS files was misreported as
        # "Module .yaml is invalid".
        raise WorkbenchModuleImportError(
            "Module Python, HTML or JS code is invalid UTF-8: %s" % str(err)
        ) from err
    except ValueError as err:
        raise WorkbenchModuleImportError(
            "Module .yaml is invalid: %s" % str(err)
        ) from err
    except KeyError as err:
        raise WorkbenchModuleImportError(
            "Zipfile is missing a required file: %s" % str(err)
        ) from err
    except SyntaxError as err:
        raise WorkbenchModuleImportError(
            "Module Python code has a syntax error: %s" % str(err)
        ) from err
    except ModuleError as err:
        raise WorkbenchModuleImportError(
            "Module Python code failed to run: %s" % str(err)
        ) from err
def invoke_fetch(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    params: Params,
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    input_parquet_filename: Optional[str],
    output_filename: str,
) -> FetchResult:
    """
    Use kernel to invoke module `fetch(...)` method and build a `FetchResult`.

    Raise `ModuleError` on error. (This is usually the module author's fault.)

    Log any ModuleError. Also log success.

    This synchronous method can be slow for complex modules, large datasets or
    slow network requests. Consider calling it from an executor.
    """
    begin = time.time()
    module_name = module_zipfile.path.name
    status = "???"
    logger.info("%s:fetch() begin", module_name)
    compiled_module = module_zipfile.compile_code_without_executing()
    try:
        result = cjwstate.modules.kernel.fetch(
            compiled_module=compiled_module,
            chroot_context=chroot_context,
            basedir=basedir,
            params=params,
            secrets=secrets,
            last_fetch_result=last_fetch_result,
            input_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )
        status = "%0.1fMB" % (result.path.stat().st_size / 1024 / 1024)
        return result
    except ModuleError as err:
        logger.exception("Exception in %s:fetch", module_name)
        status = type(err).__name__
        raise
    finally:
        # Always log duration and outcome, success or failure.
        elapsed_ms = int((time.time() - begin) * 1000)
        logger.info("%s:fetch() => %s in %dms", module_name, status, elapsed_ms)
def _execute_step_pre( *, basedir: Path, exit_stack: contextlib.ExitStack, workflow: Workflow, step: Step, module_zipfile: ModuleZipfile, raw_params: Dict[str, Any], input_path: Path, input_table_columns: List[Column], tab_results: Dict[Tab, Optional[StepResult]], ) -> ExecuteStepPreResult: """First step of execute_step(). Raise TabCycleError or TabOutputUnreachableError if the module depends on tabs with errors. Raise NoLoadedDataError if there is no input table and the module's loads_data is False (the default). Raise PromptingError if the module parameters are invalid. Raise UnneededExecution if `step` has changed. (We won't call the render() method in any of these cases.) All this runs synchronously within a database lock. (It's a separate function so that when we're done awaiting it, we can continue executing in a context that doesn't use a database thread.) `tab_results.keys()` must be ordered as the Workflow's tabs are. """ # raises UnneededExecution with locked_step(workflow, step) as safe_step: fetch_result = _load_fetch_result(safe_step, basedir, exit_stack) module_spec = module_zipfile.get_spec() if not module_spec.loads_data and not input_table_columns: raise NoLoadedDataError # raise TabCycleError, TabOutputUnreachableError, PromptingError params, tab_outputs, uploaded_files = renderprep.prep_params( params=raw_params, schema=module_spec.param_schema, step_id=step.id, input_table_columns=input_table_columns, tab_results=tab_results, basedir=basedir, exit_stack=exit_stack, ) return ExecuteStepPreResult(fetch_result, params, tab_outputs, uploaded_files)
def extract_module_messages(directory: pathlib.Path):
    """Update all i18n catalog files for the module in `directory`.

    Build a source catalog from the module's spec and Python code, then
    rewrite (only when contents changed) the default-locale po file, the pot
    template, and each non-default locale's po file.
    """
    with directory_loaded_as_zipfile_path(directory) as zip_path:
        module_zipfile = ModuleZipfile(zip_path)  # may be invalid
        source_catalog = _build_source_catalog(module_zipfile)

        default_po_path = _po_path(directory, default_locale)
        try:
            old_source_catalog = read_po_catalog(default_po_path)
        except FileNotFoundError:
            old_source_catalog = Catalog(default_locale)

        # Update file for default locale
        if not catalogs_are_same(source_catalog, old_source_catalog):
            write_po_catalog(default_po_path, source_catalog)

        # Update template catalog
        # We will have no specific locale in the template catalog
        template_catalog = copy_catalog(source_catalog, locale=None)
        move_strings_to_comments(template_catalog, comment_tag="default-message")
        pot_path = _pot_path(directory)
        try:
            old_template_catalog = read_po_catalog(pot_path)
        except FileNotFoundError:
            old_template_catalog = Catalog()
        if not catalogs_are_same(template_catalog, old_template_catalog):
            write_po_catalog(
                pot_path,
                template_catalog,
                ignore_obsolete=True,
                # we set a huge value for width, so that special comments do not wrap
                width=10000000,
                # removes locale and other info from the output file
                omit_header=True,
            )

        fuzzy = find_fuzzy_messages(
            old_catalog=old_source_catalog, new_catalog=source_catalog
        )

        for locale_id in supported_locales:
            if locale_id == default_locale:
                continue  # default locale was handled above
            locale_po_path = _po_path(directory, locale_id)
            try:
                old_locale_catalog = read_po_catalog(locale_po_path)
            except FileNotFoundError:
                old_locale_catalog = Catalog(locale_id)
            merged_catalog = _merge_nonsource_catalog(
                locale_id, old_locale_catalog, source_catalog, fuzzy
            )
            if not catalogs_are_same(merged_catalog, old_locale_catalog):
                write_po_catalog(locale_po_path, merged_catalog)
def _execute_wfmodule_pre( basedir: Path, exit_stack: contextlib.ExitStack, workflow: Workflow, wf_module: WfModule, module_zipfile: ModuleZipfile, raw_params: Dict[str, Any], input_table: ArrowTable, tab_results: Dict[Tab, Optional[RenderResult]], ) -> ExecuteStepPreResult: """ First step of execute_wfmodule(). Raise TabCycleError or TabOutputUnreachableError if the module depends on tabs with errors. (We won't call the render() method in that case.) Raise PromptingError if the module parameters are invalid. (We'll skip render() and prompt the user with quickfixes in that case.) Raise UnneededExecution if `wf_module` has changed. All this runs synchronously within a database lock. (It's a separate function so that when we're done awaiting it, we can continue executing in a context that doesn't use a database thread.) `tab_results.keys()` must be ordered as the Workflow's tabs are. """ # raises UnneededExecution with locked_wf_module(workflow, wf_module) as safe_wf_module: fetch_result = _load_fetch_result(safe_wf_module, basedir, exit_stack) module_spec = module_zipfile.get_spec() param_schema = module_spec.get_param_schema() render_context = renderprep.RenderContext( wf_module.id, input_table, tab_results, basedir, exit_stack, raw_params, # ugh ) # raise TabCycleError, TabOutputUnreachableError, PromptingError params = renderprep.get_param_values(param_schema, raw_params, render_context) return ExecuteStepPreResult(fetch_result, params)
def _build_source_catalog(module_zipfile: ModuleZipfile) -> Catalog:
    """Collect the module's default-locale messages into a fresh Catalog.

    Messages come from two places: the module spec, and translation markers
    found in the module's Python source files.
    """
    catalog = Catalog(default_locale)

    # Messages declared in the module spec
    spec = module_zipfile.get_spec()
    for message_id, source_string in find_spec_messages(spec).items():
        catalog.add(message_id, string=source_string)

    # Messages found in the module's Python code
    with zipfile.ZipFile(module_zipfile.path, mode="r") as zf:
        python_infos = [i for i in zf.infolist() if i.filename.endswith(".py")]
        for info in python_infos:
            with zf.open(info) as code_io:
                found = find_messages_in_module_code(code_io, info.filename)
            for message_id, props in found.items():
                catalog.add(
                    message_id,
                    string=props["string"],
                    auto_comments=props["comments"],
                    locations=props["locations"],
                )

    return catalog
def _get_migrated_params(
    wf_module: WfModule, module_zipfile: ModuleZipfile
) -> Dict[str, Any]:
    """
    Build the Params dict which will be passed to render().

    Call LoadedModule.migrate_params() to ensure the params are up-to-date.

    On ModuleError or ValueError, log the error and return default params.
    This will render the "wrong" thing ... but the front-end should show the
    migrate error (as it's rendering the form) so users should figure out the
    problem. (What's the alternative? Abort the whole workflow render? We
    can't render _any_ module until we've migrated _all_ modules; and it's
    hard to imagine showing the user a huge, aborted render.)

    Assume we are called within a `workflow.cooperative_lock()`.
    """
    if module_zipfile is None:
        # This is a deleted module. Renderer will pass the input through to
        # the output.
        return {}

    param_schema = module_zipfile.get_spec().get_param_schema()

    try:
        migrated = get_migrated_params(wf_module, module_zipfile=module_zipfile)
    except ModuleError:
        # LoadedModule logged this error; no need to log it again.
        return param_schema.coerce(None)

    # Is the module buggy? It might be. Log that error, and return a valid
    # set of params anyway -- even if it isn't the params the user wants.
    try:
        param_schema.validate(migrated)
    except ValueError as err:
        logger.exception(
            "%s:migrate_params() gave wrong retval: %s",
            module_zipfile.path.name,
            str(err),
        )
        return param_schema.coerce(migrated)
    return migrated
def _create_localizer_for_module_zipfile(
    cls, module_zipfile: ModuleZipfile
) -> Optional[MessageLocalizer]:
    """Read the module's po files and build a MessageLocalizer from them.

    Return None when no locale yields a valid catalog.

    A locale is skipped when its po file is missing (KeyError) or invalid
    (PoFileError, logged); other locales are still tried.
    """
    catalogs = {}
    for locale_id in supported_locales:
        try:
            catalogs[locale_id] = read_po(
                BytesIO(module_zipfile.read_messages_po_for_locale(locale_id)),
                abort_invalid=True,
            )
        except PoFileError as err:
            # Log and move on to the next locale. (The original had a
            # redundant `pass` after this logging call; removed.)
            logger.exception(
                "Invalid po file for module %s in locale %s: %s",
                module_zipfile.module_id_and_version,
                locale_id,
                err,
            )
        except KeyError:
            # No po file for this locale: that's fine, skip it.
            pass
    if not catalogs:
        return None
    return MessageLocalizer(catalogs)
def fetch_or_wrap_error(
    ctx: contextlib.ExitStack,
    chroot_context: ChrootContext,
    basedir: Path,
    module_id_name: str,
    module_zipfile: ModuleZipfile,
    migrated_params_or_error: Union[Dict[str, Any], ModuleError],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    maybe_input_crr: Optional[CachedRenderResult],
    output_path: Path,
) -> FetchResult:
    """
    Fetch, and do not raise any exceptions worth catching.

    Exceptions are wrapped -- the result is a FetchResult with `.errors`.

    This function is slow indeed. Perhaps call it from
    EventLoop.run_in_executor(). (Why not make it async? Because all the logic
    inside -- compile module, fetch() -- is sandboxed, meaning it gets its own
    processes. We may eventually avoid asyncio entirely in `fetcher`.

    These problems are all handled:

    * Module was deleted (`module_zipfile is None`)
    * Module times out (`cjwkernel.errors.ModuleTimeoutError`), in `fetch()`.
    * Module crashes (`cjwkernel.errors.ModuleExitedError`), in `fetch()`.
    * migrated_params_or_error is a `ModuleError`
    * migrated_params_or_error is invalid (`ValueError`)
    * input_crr points to a nonexistent file (`FileNotFoundError`)
    """
    # module_zipfile=None is allowed
    if module_zipfile is None:
        logger.info("fetch() deleted module '%s'", module_id_name)
        return FetchResult(
            output_path,
            [
                RenderError(
                    I18nMessage.trans(
                        "py.fetcher.fetch.no_loaded_module",
                        default="Cannot fetch: module was deleted",
                    )
                )
            ],
        )
    module_spec = module_zipfile.get_spec()
    param_schema = module_spec.get_param_schema()

    if isinstance(migrated_params_or_error, ModuleError):
        # raise the exception so we can log it
        try:
            raise migrated_params_or_error
        except ModuleError:
            # We'll always get here
            logger.exception(
                "%s:migrate_params() raised error", module_zipfile.path.name
            )
            return user_visible_bug_fetch_result(
                output_path, format_for_user_debugging(migrated_params_or_error)
            )
    migrated_params = migrated_params_or_error

    try:
        param_schema.validate(migrated_params)
    except ValueError:
        logger.exception(
            "Invalid return value from %s:migrate_params()",
            module_zipfile.path.name,
        )
        return user_visible_bug_fetch_result(
            output_path,
            "%s:migrate_params() output invalid params" % module_zipfile.path.name,
        )

    # get input_metadata, input_parquet_path. (This can't error.)
    input_parquet_path, input_metadata = _download_cached_render_result(
        ctx, maybe_input_crr, dir=basedir
    )

    # Clean params, so they're of the correct type. (This can't error.)
    params = Params(
        fetchprep.clean_value(param_schema, migrated_params, input_metadata)
    )

    # actually fetch
    try:
        return invoke_fetch(
            module_zipfile,
            chroot_context=chroot_context,
            basedir=basedir,
            params=params,
            secrets=secrets,
            last_fetch_result=last_fetch_result,
            input_parquet_filename=(
                None if input_parquet_path is None else input_parquet_path.name
            ),
            output_filename=output_path.name,
        )
    except ModuleError as err:
        logger.exception("Error calling %s:fetch()", module_zipfile.path.name)
        return user_visible_bug_fetch_result(
            output_path, format_for_user_debugging(err)
        )
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_filename: Optional[str],
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[FetchResult],
    tab_outputs: Dict[str, TabOutput],
    uploaded_files: Dict[str, UploadedFile],
    output_filename: str,
) -> LoadedRenderResult:
    """Use kernel to process `table` with module `render` function.

    Raise `ModuleError` on error. (This is usually the module author's fault.)

    Log any ModuleError. Also log success.

    This synchronous method can be slow for complex modules or large datasets.
    Consider calling it from an executor.
    """
    time1 = time.time()
    begin_status_format = "%s:render() (%0.1fMB input)"
    begin_status_args = (
        module_zipfile.path.name,
        (
            (basedir / input_filename).stat().st_size / 1024 / 1024
            if input_filename is not None
            else 0
        ),
    )
    logger.info(begin_status_format + " begin", *begin_status_args)
    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_filename=input_filename,
            params=params,
            tab_name=tab_name,
            fetch_result=fetch_result,
            tab_outputs=tab_outputs,
            uploaded_files=uploaded_files,
            output_filename=output_filename,
        )
        output_path = basedir / output_filename
        st_size = output_path.stat().st_size
        if st_size == 0:
            # Zero-byte output file means "no output table".
            table = pa.table({})
            columns = []
            status = "(no output)"
        else:
            try:
                table, columns = load_untrusted_arrow_file_with_columns(output_path)
                status = "(%drows, %dcols, %0.1fMB)" % (
                    table.num_rows,
                    table.num_columns,
                    st_size / 1024 / 1024,
                )
            except ValidateError as err:
                # Output isn't a valid Arrow file: treat it as a module crash.
                raise ModuleExitedError(
                    module_zipfile.path.name,
                    0,
                    "Module wrote invalid data: %s" % str(err),
                )
        return LoadedRenderResult(
            path=output_path,
            table=table,
            columns=columns,
            errors=result.errors,
            json=result.json,
        )
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        # Always log duration and outcome, success or failure.
        time2 = time.time()
        logger.info(
            begin_status_format + " => %s in %dms",
            *begin_status_args,
            status,
            int((time2 - time1) * 1000),
        )
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_table: ArrowTable,
    params: Params,
    tab: Tab,
    fetch_result: Optional[FetchResult],
    output_filename: str,
) -> RenderResult:
    """
    Use kernel to process `table` with module `render` function.

    Raise `ModuleError` on error. (This is usually the module author's fault.)

    Log any ModuleError. Also log success.

    This synchronous method can be slow for complex modules or large datasets.
    Consider calling it from an executor.
    """
    begin = time.time()
    begin_status_format = "%s:render() (%d rows, %d cols, %0.1fMB)"
    begin_status_args = (
        module_zipfile.path.name,
        input_table.metadata.n_rows,
        len(input_table.metadata.columns),
        input_table.n_bytes_on_disk / 1024 / 1024,
    )
    logger.info(begin_status_format + " begin", *begin_status_args)
    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_table=input_table,
            params=params,
            tab=tab,
            fetch_result=fetch_result,
            output_filename=output_filename,
        )
        status = "(%drows, %dcols, %0.1fMB)" % (
            result.table.metadata.n_rows,
            len(result.table.metadata.columns),
            result.table.n_bytes_on_disk / 1024 / 1024,
        )
        return result
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        # Always log duration and outcome, success or failure.
        elapsed_ms = int((time.time() - begin) * 1000)
        logger.info(
            begin_status_format + " => %s in %dms",
            *begin_status_args,
            status,
            elapsed_ms,
        )