def print_module_info(module_types, module_names, snake_storage, reverse_config_usage):
    """Dispatch to print_modules() for the requested module types and module names.

    If module_types is empty or contains "all", every module type is used. If module_names is empty, all
    modules of the chosen types are printed; otherwise only the named modules are printed, and any name
    that matched nothing is reported on the console.
    """
    storage_by_type = {
        "annotators": snake_storage.all_annotators,
        "importers": snake_storage.all_importers,
        "exporters": snake_storage.all_exporters,
        "installers": snake_storage.all_installers
    }

    if not module_types or "all" in module_types:
        module_types = storage_by_type.keys()

    wanted = [name.lower() for name in module_names]

    # No specific modules requested: print everything for the chosen types
    if not wanted:
        for module_type in module_types:
            print_modules(storage_by_type.get(module_type), module_type, reverse_config_usage, snake_storage)
        return

    # Specific modules requested: print only those, and keep track of names that never matched
    not_found = wanted
    for module_type in module_types:
        available = storage_by_type.get(module_type)
        selected = {name: functions for name, functions in available.items() if name in wanted}
        if not selected:
            continue
        not_found = [name for name in not_found if name not in selected]
        print_modules(selected, module_type, reverse_config_usage, snake_storage)

    if not_found:
        plural = "s" if len(not_found) > 1 else ""
        console.print("[red]Module{} not found: {}[/red]".format(plural, ", ".join(not_found)))
def prettyprint_yaml(in_dict):
    """Dump in_dict as YAML and print it to the console with syntax highlighting.

    The YAML is written in block style with an indentation of 4, with lists indented and with unicode
    characters left unescaped. Aliases are never emitted; repeated objects are written out in full.

    Args:
        in_dict: The data structure to dump.
    """
    from rich.syntax import Syntax
    import yaml

    class MyDumper(yaml.Dumper):
        """Customized YAML dumper that indents lists and never emits aliases."""

        def increase_indent(self, flow=False, indentless=False):
            """Force indentation (the 'indentless' argument is deliberately not passed on)."""
            return super().increase_indent(flow)

        def ignore_aliases(self, data):
            """Resolve aliases and replace them with their anchors' contents.

            This is overridden on the subclass rather than assigned to yaml.Dumper, so that other
            users of yaml.Dumper in the same process are not affected.
            """
            return True

    yaml_str = yaml.dump(in_dict, default_flow_style=False, Dumper=MyDumper, indent=4, allow_unicode=True)
    # Print syntax highlighted
    console.print(Syntax(yaml_str, "yaml"))
def print_annotation_classes():
    """Print a two-column table of the annotation classes found in the registry.

    Classes defined by pipeline modules are always listed. Classes coming from the config are listed
    in a second section, but only if there are any.
    """
    print()
    class_table = Table(title="Available annotation classes", box=box.SIMPLE, show_header=False,
                        title_justify="left")
    class_table.add_column(no_wrap=True)
    class_table.add_column()

    # Section 1: classes defined by modules (each class may map to several annotations)
    class_table.add_row("[b]Defined by pipeline modules[/b]")
    class_table.add_row(" [i]Class[/i]", "[i]Annotation[/i]")
    for class_name, annotations in registry.annotation_classes["module_classes"].items():
        class_table.add_row(" " + class_name, "\n".join(annotations))

    # Section 2: classes set in the config (one annotation per class)
    config_classes = registry.annotation_classes["config_classes"]
    if config_classes:
        class_table.add_row()
        class_table.add_row("[b]From config[/b]")
        class_table.add_row(" [i]Class[/i]", "[i]Annotation[/i]")
        for class_name, annotation in config_classes.items():
            class_table.add_row(" " + class_name, annotation)

    console.print(class_table)
def print_module_summary(snake_storage):
    """Print a table listing every module with its description, grouped by module type.

    Descriptions are read from the registry, except for modules whose name starts with "custom.",
    where get_custom_module_description() is used instead. Two usage hints are printed after the table.
    """
    modules_by_type = {
        "annotators": snake_storage.all_annotators,
        "importers": snake_storage.all_importers,
        "exporters": snake_storage.all_exporters,
        "installers": snake_storage.all_installers
    }

    print()
    summary = Table(title="Available modules", box=box.SIMPLE, show_header=False, title_justify="left")
    summary.add_column(no_wrap=True)
    summary.add_column()

    for type_name, type_modules in modules_by_type.items():
        summary.add_row(f"[b]{type_name.upper()}[/b]")
        for name in sorted(type_modules):
            registered_description = registry.modules[name].description
            if name.startswith("custom."):
                text = get_custom_module_description(name)
            else:
                text = registered_description or ""
            summary.add_row(" " + name, text)
        # Empty row as a separator between module types
        summary.add_row()

    console.print(summary)
    console.print("For more details about a specific module run [green]'sparv modules \\[module name]'[/green].",
                  highlight=False)
    console.print(
        "For more details about all modules of a specific type run [green]'sparv modules --\\[module type]'[/green].",
        highlight=False)
def copy_resource_files(data_dir: pathlib.Path):
    """Mirror the "resources" directory of the sparv package into data_dir.

    Directories are created as needed. A file that does not yet exist in data_dir is copied. A file that
    exists but differs from the packaged one is first backed up as "<name>.bak" and then replaced. A file
    that is considered equal by filecmp.cmp() is left alone.
    """
    source_root = pathlib.Path(pkg_resources.resource_filename("sparv", "resources"))

    for source in source_root.rglob("*"):
        relative = source.relative_to(source_root)
        target = data_dir / relative

        if source.is_dir():
            target.mkdir(parents=True, exist_ok=True)
            continue

        if target.is_file():
            if filecmp.cmp(source, target):
                # Already up to date
                continue
            # Keep the user's current version as a backup before overwriting it
            backup = data_dir / relative.parent / (relative.name + ".bak")
            shutil.copy(target, backup)
            console.print(f"{relative} has been updated and a backup was created")

        shutil.copy(source, target)
def error(msg):
    """Print msg to the console as red text."""
    styled = Text(msg, style="red")
    console.print(styled)
def warning(msg):
    """Print msg to the console as yellow text."""
    styled = Text(msg, style="yellow")
    console.print(styled)
def info(msg):
    """Print msg to the console as green text."""
    styled = Text(msg, style="green")
    console.print(styled)
def run(sparv_datadir: Optional[str] = None):
    """Query user about data dir path unless provided by argument, and populate path with files.

    The path is chosen as follows:
      * If sparv_datadir is given, it is used as is (even if the environment variable is set).
      * Otherwise, if the data dir environment variable is set, the user is asked to confirm that path.
      * Otherwise the user is asked to type a path; empty input keeps the current data dir if there is
        one, and falls back to the default user data dir if not.

    After that the directory structure is created, the path is saved to the Sparv config file (unless the
    path came from the environment variable), resource files are copied and a version file is written.

    Args:
        sparv_datadir: Path to use as data directory. If omitted, the setup is interactive.

    Raises:
        SystemExit: If the user interrupts or aborts the setup (exit status 0), or if the directories
            could not be created (exit status 1).
    """
    default_dir = pathlib.Path(appdirs.user_data_dir("sparv"))
    current_dir = paths.get_data_path()
    path: pathlib.Path
    using_env = bool(os.environ.get(paths.data_dir_env))

    if sparv_datadir:
        # Specifying a path on the command line will perform the setup using that path, even if the environment
        # variable is set
        using_env = False
        path = pathlib.Path(sparv_datadir)
    else:
        console.print(
            "\n[b]Sparv Data Directory Setup[/b]\n\n"
            f"Current data directory: [green]{current_dir or '<not set>'}[/green]\n\n"
            "Sparv needs a place to store its configuration files, language models and other data. "
            "After selecting the directory you want to use for this purpose, Sparv will populate it with a default "
            "config file and presets. Any existing files in the target directory will be backed up. Any previous "
            "backups will be overwritten.")
        console.print(
            Padding(
                "[b]Tip:[/b] This process can also be completed non-interactively. Run 'sparv setup --help' for details. "
                f"You may also override the data directory setting using the environment variable '{paths.data_dir_env}'.",
                (1, 4)))

        if using_env:
            try:
                cont = Confirm.ask(
                    f"[b red]NOTE:[/b red] Sparv's data directory is currently set to '{current_dir}' using the "
                    f"environment variable '{paths.data_dir_env}'. This variable takes precedence over any previous "
                    f"path set using this setup process. To change the path, either edit the environment variable, or "
                    f"delete the variable and rerun the setup command.\n"
                    "Do you want to continue the setup process using the above path?"
                )
            except KeyboardInterrupt:
                console.print("\nSetup interrupted.")
                sys.exit()
            if not cont:
                console.print("\nSetup aborted.")
                sys.exit()
            path = current_dir
        else:
            # Ask user for path
            if current_dir:
                msg = f" Leave empty to continue using '{current_dir}':"
            else:
                msg = f" Leave empty to use the default which is '{default_dir}':"

            try:
                console.print(f"Enter the path to the directory you want to use.{msg}")
                path_str = input().strip()
            except KeyboardInterrupt:
                console.print("\nSetup interrupted.")
                sys.exit()
            if path_str:
                path = pathlib.Path(path_str)
            else:
                if current_dir:
                    path = current_dir
                else:
                    path = default_dir

    # Subdirectories to create inside the data dir
    dirs = [paths.bin_dir.name, paths.config_dir.name, paths.models_dir.name]

    try:
        # Expand any "~"
        path = path.expanduser()
        # Create directories
        path.mkdir(parents=True, exist_ok=True)
        for d in dirs:
            (path / d).mkdir(exist_ok=True)
    except (OSError, RuntimeError, ValueError):
        # OSError: the directory could not be created (permissions, a file in the way, ...).
        # RuntimeError: expanduser() could not determine the home directory.
        # ValueError: the path is malformed, e.g. contains a null byte.
        # A bare 'except' is avoided here so that Ctrl-C and programming errors are not
        # misreported as a directory creation problem.
        console.print(
            "\nAn error occurred while trying to create the directories. "
            "Make sure the path you entered is correct, and that you have the necessary read/write permissions."
        )
        sys.exit(1)

    if not using_env:
        # Save data dir setting to config file
        config_dict = {"sparv_data": str(path)}
        paths.sparv_config_file.parent.mkdir(parents=True, exist_ok=True)
        with open(paths.sparv_config_file, "w") as f:
            yaml.dump(config_dict, f)

    copy_resource_files(path)

    # Save Sparv version number to a file in data dir
    (path / VERSION_FILE).write_text(__version__)

    console.print(f"\nSetup completed. The Sparv data directory is set to '{path}'.")
def print_error(msg: str):
    """Print msg to the console in red, preceded by a newline, with automatic highlighting turned off."""
    markup = "[red]\n{}[/red]".format(msg)
    console.print(markup, highlight=False)
def print_modules(modules: dict, module_type: str, reverse_config_usage: dict, snake_storage: snake_utils.SnakeStorage,
                  print_params: bool = False):
    """Print module information.

    For every module (sorted by name) its description is printed, followed by one section per function
    in the module (sorted by name) containing: the function's description, its annotations, the
    configuration variables it uses, and its arguments.

    Args:
        modules: Maps module name to a dict that maps function name to a dict of function info. The keys
            read from the function info are "description", "params" and "annotations".
        module_type: Name of the module type, used as header.
        reverse_config_usage: Maps "<module name>:<function name>" to a collection of config variables,
            each indexable as (name, description).
        snake_storage: Storage object, from which 'all_custom_annotators' is read.
        print_params: If True, arguments are printed for all functions. Arguments are always printed for
            functions registered as custom annotators.
    """
    custom_annotations = snake_storage.all_custom_annotators

    # Box styles. A rich Box definition must consist of 8 rows of exactly 4 characters each
    # (left edge, fill, divider, right edge), otherwise Box() raises a ValueError.
    left_line = box.Box("    \n┃   \n┃   \n┃   \n┃   \n┃   \n┃   \n    ")
    minimal = box.Box("    \n  │ \n╶─┼╴\n  │ \n╶─┼╴\n╶─┼╴\n  │ \n    \n")
    box_style = minimal

    # Module type header
    print()
    console.print(f" [b]{module_type.upper()}[/b]", style="reverse", justify="left")  # 'justify' to fill entire width
    print()

    for i, module_name in enumerate(sorted(modules)):
        # Separator line between modules
        if i:
            console.print(Rule())

        # Module name header
        console.print(f"\n[bright_black]:[/][dim]:[/]: [b]{module_name.upper()}[/b]\n")

        # Module description
        description = None
        if registry.modules[module_name].description:
            description = registry.modules[module_name].description
        elif module_name.startswith("custom."):
            description = get_custom_module_description(module_name)
        if description:
            console.print(Padding(description, (0, 4, 1, 4)))

        for f_name in sorted(modules[module_name]):
            # Function name and description
            f_desc = modules[module_name][f_name]["description"]
            console.print(Padding(Panel(f"[b]{f_name.upper()}[/b]\n[i]{f_desc}[/i]", box=left_line, padding=(0, 1),
                                        border_style="bright_green"), (0, 2)))

            # Get parameters. Always print these for custom annotations
            params = modules[module_name][f_name].get("params", {})
            custom_params = None
            if custom_annotations.get(module_name, {}).get(f_name):
                custom_params = custom_annotations[module_name][f_name].get("params", {})
                params = custom_params

            # Annotations
            f_anns = modules[module_name][f_name].get("annotations", {})
            if f_anns:
                # Only use the box with a column divider if at least one annotation has a description
                this_box_style = box_style if any(a[1] for a in f_anns) else box.SIMPLE
                table = Table(title="[b]Annotations[/b]", box=this_box_style, show_header=False,
                              title_justify="left", padding=(0, 2), pad_edge=False, border_style="bright_black")
                table.add_column(no_wrap=True)
                table.add_column()
                for f_ann in sorted(f_anns):
                    table.add_row("• " + f_ann[0].name + (
                        f"\n [i dim]class:[/] <{f_ann[0].cls}>" if f_ann[0].cls else ""),
                        f_ann[1] or "")
                console.print(Padding(table, (0, 0, 0, 4)))
            elif custom_params:
                # Print info about custom annotators. There are no annotations in this branch, so the
                # simple box is always the right choice.
                table = Table(title="[b]Annotations[/b]", box=box.SIMPLE, show_header=False,
                              title_justify="left", padding=(0, 2), pad_edge=False, border_style="bright_black")
                table.add_column()
                table.add_row("In order to use this annotator you first need to declare it in the 'custom_annotations' "
                              "section of your corpus configuration and specify its arguments.")
                console.print(Padding(table, (0, 0, 0, 4)))

            # Config variables
            f_config = reverse_config_usage.get(f"{module_name}:{f_name}")
            if f_config:
                console.print()
                table = Table(title="[b]Configuration variables used[/b]", box=box_style, show_header=False,
                              title_justify="left", padding=(0, 2), pad_edge=False, border_style="bright_black")
                table.add_column(no_wrap=True)
                table.add_column()
                for config_key in sorted(f_config):
                    table.add_row("• " + config_key[0], config_key[1] or "")
                console.print(Padding(table, (0, 0, 0, 4)))

            # Arguments
            if (print_params and params) or custom_params:
                table = Table(title="[b]Arguments[/b]", box=box_style, show_header=False, title_justify="left",
                              padding=(0, 2), pad_edge=False, border_style="bright_black")
                table.add_column(no_wrap=True)
                table.add_column()
                # Each parameter maps to a tuple (default value, type, is list, is optional)
                for p, (default, typ, li, optional) in params.items():
                    opt_str = "(optional) " if optional else ""
                    typ_str = "list of " + typ.__name__ if li else typ.__name__
                    def_str = f", default: {repr(default)}" if default is not None else ""
                    table.add_row("• " + p, f"{opt_str}{typ_str}{def_str}")
                console.print(Padding(table, (0, 0, 0, 4)))
            print()
def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_missing: bool = False,
                custom_rule_obj: Optional[dict] = None) -> bool:
    """
    Populate rule with Snakemake input, output and parameter list.

    Return True if a Snakemake rule should be created.

    The signature of the annotator function is inspected, and every parameter is handled according to
    its type hint: outputs are added to rule.outputs, annotations, models and binaries are added to
    rule.inputs, and the value to call the function with is saved in rule.parameters. Config variables
    and classes used by the rule, as well as missing config values and binaries, are collected along
    the way. Information about the rule is also saved in storage.

    Args:
        rule: Object containing snakemake rule parameters.
        config: Dictionary containing the corpus configuration.
        storage: Object for saving information for all rules.
        config_missing: True if there is no corpus config file.
        custom_rule_obj: Custom annotation dictionary from corpus config.

    Returns:
        True if a Snakemake rule should be created, otherwise False.

    Raises:
        util.SparvErrorMessage: If a parameter of a custom rule has neither a value in custom_rule_obj
            nor a default value.
    """
    # Only create certain rules when config is missing
    if config_missing and not rule.modelbuilder:
        return False

    # Skip any annotator that is not available for the selected corpus language
    if rule.annotator_info["language"] and sparv_config.get("metadata.language") and \
            sparv_config.get("metadata.language") not in rule.annotator_info["language"]:
        return False

    # Get this function's parameters
    params = OrderedDict(inspect.signature(rule.annotator_info["function"]).parameters)
    param_dict = make_param_dict(params)

    if rule.importer:
        rule.inputs.append(Path(get_source_path(), "{doc}." + rule.file_extension))
        storage.all_importers.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {"description": rule.description, "params": param_dict})
        if rule.target_name == sparv_config.get("import.importer"):
            # Importers always generate corpus text file
            rule.outputs.append(paths.work_dir / "{doc}" / io.TEXT_FILE)
            # If importer guarantees other outputs, add them to outputs list
            if rule.import_outputs:
                if isinstance(rule.import_outputs, Config):
                    rule.import_outputs = sparv_config.get(rule.import_outputs, rule.import_outputs.default)
                annotations_ = set()
                renames = {}
                # Annotation list needs to be sorted to handle plain annotations before attributes
                for ann, target in sorted(util.parse_annotation_list(rule.import_outputs)):
                    # Handle annotations renamed during import
                    if target:
                        source_ann, source_attr = BaseAnnotation(ann).split()
                        if not source_attr:
                            renames[ann] = target
                            ann = target
                        else:
                            ann = io.join_annotation(renames.get(source_ann, source_ann), target)
                    annotations_.add(ann)

                for element in annotations_:
                    rule.outputs.append(paths.work_dir / get_annotation_path(element))

            # If import.document_annotation has been specified, add it to outputs if not already there
            if sparv_config.get("import.document_annotation"):
                doc_ann_file = paths.work_dir / get_annotation_path(sparv_config.get("import.document_annotation"))
                if doc_ann_file not in rule.outputs:
                    rule.outputs.append(doc_ann_file)

    if rule.exporter:
        storage.all_exporters.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {"description": rule.description, "params": param_dict})
    elif rule.installer:
        storage.all_installers.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {"description": rule.description, "params": param_dict})

    output_dirs = set()  # Directories where export files are stored
    custom_params = set()  # Names of parameters given in the custom rule but not yet matched to the function

    if custom_rule_obj:
        if custom_rule_obj.get("params"):
            name_custom_rule(rule, storage)
            custom_params = set(custom_rule_obj.get("params").keys())
        else:
            # This rule has already been populated, so don't process it again
            return False

    # Go though function parameters and handle based on type
    for param_name, param in params.items():
        param_default_empty = param.default == inspect.Parameter.empty
        param_value: Any

        # Get parameter value, either from custom rule object or default value
        if custom_rule_obj:
            if param_name in custom_rule_obj["params"]:
                param_value = custom_rule_obj["params"][param_name]
                custom_params.remove(param_name)
            elif not param_default_empty:
                param_value = copy.deepcopy(param.default)
            else:
                raise util.SparvErrorMessage(
                    f"Parameter '{param_name}' in custom rule '{rule.full_name}' has no value!", "sparv", "config")
        else:
            if param_default_empty:
                # This is probably an unused custom rule, so don't process it any further,
                # but save it in all_custom_annotators and all_annotators
                storage.all_custom_annotators.setdefault(rule.module_name, {}).setdefault(
                    rule.f_name, {"description": rule.description, "params": param_dict})
                storage.custom_targets.append((rule.target_name, rule.description))
                storage.all_annotators.setdefault(rule.module_name, {}).setdefault(
                    rule.f_name, {"description": rule.description, "annotations": [], "params": param_dict})
                return False
            else:
                param_value = copy.deepcopy(param.default)

        param_type, param_list, param_optional = registry.get_type_hint_type(param.annotation)

        # Output
        if issubclass(param_type, BaseOutput):
            if not isinstance(param_value, BaseOutput):
                if not param_value:
                    return False
                param_value = param_type(param_value)
            rule.configs.update(registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            missing_configs = param_value.expand_variables(rule.full_name)
            rule.missing_config.update(missing_configs)
            ann_path = get_annotation_path(param_value, data=param_type.data, common=param_type.common)
            if param_type.all_docs:
                rule.outputs.extend(map(Path, expand(escape_wildcards(paths.work_dir / ann_path),
                                                     doc=get_source_files(storage.source_files))))
            elif param_type.common:
                rule.outputs.append(paths.work_dir / ann_path)
                if rule.installer:
                    storage.install_outputs[rule.target_name].append(paths.work_dir / ann_path)
            else:
                rule.outputs.append(get_annotation_path(param_value, data=param_type.data))
            rule.parameters[param_name] = param_value
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
            if rule.annotator:
                storage.all_annotators.setdefault(rule.module_name, {}).setdefault(
                    rule.f_name, {"description": rule.description, "annotations": [], "params": param_dict})
                storage.all_annotators[rule.module_name][rule.f_name]["annotations"].append(
                    (param_value, param_value.description))
        # ModelOutput
        elif param_type == ModelOutput:
            rule.configs.update(registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            rule.missing_config.update(param_value.expand_variables(rule.full_name))
            model_path = param_value.path
            rule.outputs.append(model_path)
            rule.parameters[param_name] = ModelOutput(str(model_path))
            storage.model_outputs.append(model_path)
        # Annotation
        elif issubclass(param_type, BaseAnnotation):
            if not isinstance(param_value, BaseAnnotation):
                if not param_value:
                    return False
                param_value = param_type(param_value)
            rule.configs.update(registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            missing_configs = param_value.expand_variables(rule.full_name)
            # An optional annotation that is empty or depends on missing config is simply left out
            if (not param_value or missing_configs) and param_optional:
                rule.parameters[param_name] = None
                continue
            rule.missing_config.update(missing_configs)
            ann_path = get_annotation_path(param_value, data=param_type.data, common=param_type.common)
            if param_type.all_docs:
                rule.inputs.extend(expand(escape_wildcards(paths.work_dir / ann_path),
                                          doc=get_source_files(storage.source_files)))
            elif rule.exporter or rule.installer or param_type.common:
                rule.inputs.append(paths.work_dir / ann_path)
            else:
                rule.inputs.append(ann_path)

            rule.parameters[param_name] = param_value
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
        # ExportAnnotations
        elif param_type in (ExportAnnotations, ExportAnnotationsAllDocs):
            if not isinstance(param_value, param_type):
                param_value = param_type(param_value)
            rule.parameters[param_name] = param_value

            source = param.default.config_name
            annotations = sparv_config.get(f"{source}", [])
            if not annotations:
                rule.missing_config.add(f"{source}")
            export_annotations = util.parse_annotation_list(annotations, add_plain_annotations=False)
            annotation_type = Annotation if param_type == ExportAnnotations else AnnotationAllDocs
            plain_annotations = set()
            possible_plain_annotations = []
            for i, (export_annotation_name, export_name) in enumerate(export_annotations):
                annotation = annotation_type(export_annotation_name)
                rule.configs.update(registry.find_config_variables(annotation.name))
                rule.classes.update(registry.find_classes(annotation.name))
                rule.missing_config.update(annotation.expand_variables(rule.full_name))
                export_annotations[i] = (annotation, export_name)
                plain_name, attr = annotation.split()
                if not attr:
                    plain_annotations.add(plain_name)
                else:
                    if plain_name not in possible_plain_annotations:
                        possible_plain_annotations.append(plain_name)
            # Add plain annotations where needed
            for a in possible_plain_annotations:
                if a not in plain_annotations:
                    export_annotations.append((annotation_type(a), None))

            for annotation, export_name in export_annotations:
                if param.default.is_input:
                    if param_type == ExportAnnotationsAllDocs:
                        rule.inputs.extend(
                            expand(escape_wildcards(paths.work_dir / get_annotation_path(annotation.name)),
                                   doc=get_source_files(storage.source_files)))
                    else:
                        rule.inputs.append(paths.work_dir / get_annotation_path(annotation.name))
                rule.parameters[param_name].append((annotation, export_name))
        # SourceAnnotations
        elif param_type == SourceAnnotations:
            rule.parameters[param_name] = sparv_config.get(f"{param.default.config_name}", None)
        # Corpus
        elif param.annotation == Corpus:
            rule.parameters[param_name] = Corpus(sparv_config.get("metadata.id"))
        # Language
        elif param.annotation == Language:
            rule.parameters[param_name] = Language(sparv_config.get("metadata.language"))
        # Document
        elif param.annotation == Document:
            rule.docs.append(param_name)
        # AllDocuments (all source documents)
        elif param_type == AllDocuments:
            rule.parameters[param_name] = AllDocuments(get_source_files(storage.source_files))
        # Text
        elif param_type == Text:
            text_path = Path("{doc}") / io.TEXT_FILE
            if rule.exporter or rule.installer:
                rule.inputs.append(paths.work_dir / text_path)
            else:
                rule.inputs.append(text_path)
            rule.parameters[param_name] = param_value
        # Model
        elif param_type == Model:
            if param_value is not None:
                if param_list:
                    rule.parameters[param_name] = []
                    for model in param_value:
                        if not isinstance(model, Model):
                            # Wrap the individual item, not the whole list
                            model = Model(model)
                        rule.configs.update(registry.find_config_variables(model.name))
                        rule.classes.update(registry.find_classes(model.name))
                        rule.missing_config.update(model.expand_variables(rule.full_name))
                        rule.inputs.append(model.path)
                        rule.parameters[param_name].append(Model(str(model.path)))
                else:
                    if not isinstance(param_value, Model):
                        param_value = Model(param_value)
                    rule.configs.update(registry.find_config_variables(param_value.name))
                    rule.classes.update(registry.find_classes(param_value.name))
                    rule.missing_config.update(param_value.expand_variables(rule.full_name))
                    rule.inputs.append(param_value.path)
                    rule.parameters[param_name] = Model(str(param_value.path))
        # Binary
        elif param.annotation in (Binary, BinaryDir):
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            binary = util.find_binary(param_value, executable=False, allow_dir=param.annotation == BinaryDir)
            if not binary:
                rule.missing_binaries.add(param_value)
            # Fall back to the unresolved value if the binary was not found
            binary = Path(binary if binary else param_value)
            rule.inputs.append(binary)
            rule.parameters[param_name] = param.annotation(binary)
        # Source
        elif param.annotation == Source:
            rule.parameters[param_name] = Source(get_source_path())
        # Export
        elif param.annotation == Export:
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            if param.default.absolute_path:
                export_path = Path(param_value)
            else:
                export_path = paths.export_dir / param_value
            output_dirs.add(export_path.parent)
            rule.outputs.append(export_path)
            rule.parameters[param_name] = Export(str(export_path))
            if "{doc}" in rule.parameters[param_name]:
                rule.doc_annotations.append(param_name)
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
        # ExportInput
        elif param.annotation == ExportInput:
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            if param.default.absolute_path:
                rule.parameters[param_name] = ExportInput(param_value)
            else:
                rule.parameters[param_name] = ExportInput(paths.export_dir / param_value)
            if param.default.all_docs:
                rule.inputs.extend(expand(escape_wildcards(rule.parameters[param_name]),
                                          doc=get_source_files(storage.source_files)))
            else:
                rule.inputs.append(Path(rule.parameters[param_name]))
            if "{" in rule.parameters[param_name]:
                rule.wildcard_annotations.append(param_name)
        # Config
        elif isinstance(param_value, Config):
            rule.configs.add(param_value.name)
            config_value = sparv_config.get(param_value, sparv_config.Unset)
            if config_value is sparv_config.Unset:
                if param_value.default is not None:
                    config_value = param_value.default
                elif param_optional:
                    config_value = None
                else:
                    rule.missing_config.add(param_value)
            rule.parameters[param_name] = config_value
        # Everything else
        else:
            rule.parameters[param_name] = param_value

    # For custom rules, warn the user of any unknown parameters
    if custom_params:
        print_sparv_warning(
            "The parameter{} '{}' used in one of your custom rules "
            "do{} not exist in {}.".format(
                "s" if len(custom_params) > 1 else "",
                "', '".join(custom_params),
                "es" if len(custom_params) == 1 else "",
                rule.full_name))

    storage.all_rules.append(rule)

    # Add to rule lists in storage
    update_storage(storage, rule)

    # Add exporter dirs (used for informing user)
    if rule.exporter:
        if rule.abstract:
            output_dirs = {p.parent for p in rule.inputs}
        rule.export_dirs = [str(p / "_")[:-1] for p in output_dirs]

    if rule.missing_config:
        # Entries of the form "<...>" are reported as missing classes (without the angle brackets),
        # everything else as missing config variables
        log_handler.messages["missing_configs"][rule.full_name].update(
            [c for c in rule.missing_config if not c.startswith("<")])
        log_handler.messages["missing_classes"][rule.full_name].update(
            [c[1:-1] for c in rule.missing_config if c.startswith("<")])

    if rule.missing_binaries:
        log_handler.messages["missing_binaries"][rule.full_name].update(rule.missing_binaries)

    if config.get("debug"):
        print()
        console.print("[b]{}:[/b] {}".format(rule.module_name.upper(), rule.f_name))
        print()
        console.print(" [b]INPUTS[/b]")
        for i in rule.inputs:
            print(" {}".format(i))
        print()
        console.print(" [b]OUTPUTS[/b]")
        for o in rule.outputs:
            print(" {}".format(o))
        print()
        console.print(" [b]PARAMETERS[/b]")
        for p in rule.parameters:
            print(" {} = {!r}".format(p, rule.parameters[p]))
        print()
        print()

    return True
def print_sparv_info(msg):
    """Print msg to the console in green, with automatic highlighting turned off."""
    markup = "[green]{}[/green]".format(msg)
    console.print(markup, highlight=False)
def print_sparv_warning(msg):
    """Print msg to the console in yellow, prefixed with "WARNING: ", with automatic highlighting turned off."""
    markup = "[yellow]WARNING: {}[/yellow]".format(msg)
    console.print(markup, highlight=False)