def update_config(self, answers: dict):
    """Store the given answers in the corpus config and refresh the full config.

    Every key/value pair in ``answers`` is written into ``self.corpus_config``
    via ``config.set_value``. Once all pairs are stored, ``config.update_config``
    is called a single time with the corpus config.

    Args:
        answers: Mapping from config key to the value that should be set.
    """
    for key, value in answers.items():
        config.set_value(key, value, config_dict=self.corpus_config)

    config.update_config(self.corpus_config)
def update_export_annotations(self, selected_annotations):
    """Add selected annotations to corpus config.

    Walks ``selected_annotations[module][function]`` (each a list of dicts),
    takes every entry's ``"annotation"`` string, fills in any ``{name}``
    placeholders from the entry's optional ``"wildcards"`` mapping, and stores
    the resulting list under ``export.annotations`` in ``self.corpus_config``.

    Args:
        selected_annotations: Nested mapping module -> function name -> list
            of annotation dicts. Each dict has an ``"annotation"`` key and
            may have a ``"wildcards"`` dict of placeholder name -> value.
    """
    annotations = []
    for functions in selected_annotations.values():
        for function_annotations in functions.values():
            for ann in function_annotations:
                value = ann["annotation"]
                if "wildcards" in ann:
                    # Replace wildcards with values. This is a plain literal
                    # substitution on purpose: treating the placeholder as a
                    # regex pattern and the value as a regex replacement
                    # template would misinterpret backslashes, group
                    # references and quantifier-like names.
                    for wc, wcval in ann["wildcards"].items():
                        value = value.replace(f"{{{wc}}}", wcval)
                annotations.append(value)

    config.set_value("export.annotations", annotations, config_dict=self.corpus_config)
def load_config(snakemake_config):
    """Load the corpus config, if present, and apply any language override.

    The corpus config is looked up as ``paths.config_file`` relative to the
    current working directory. When the file exists it is loaded and its
    ``classes`` section (or an empty dict) is stored in the registry as
    ``config_classes``. A truthy ``language`` entry in ``snakemake_config``
    always overrides ``metadata.language``, whether or not a file was found.

    Args:
        snakemake_config: Mapping that may carry a ``language`` override.

    Returns:
        True if no corpus config file was found, otherwise False.
    """
    config_path = Path.cwd() / paths.config_file
    config_missing = not config_path.is_file()

    if not config_missing:
        # Read config and add its classes to the registry
        sparv_config.load_config(config_path)
        registry.annotation_classes["config_classes"] = sparv_config.config.get("classes", {})

    # Some commands may override the corpus language
    language_override = snakemake_config.get("language")
    if language_override:
        sparv_config.set_value("metadata.language", language_override)

    return config_missing
def _add_to_registry(annotator):
    """Add function to annotator registry. Used by annotator.

    Reads the keys "module_name", "name", "function", "language", "config",
    "type" and "document_annotation" from ``annotator``. The rule name is
    "<module_name>:<function name>", where the function name is
    ``annotator["name"]`` if set and the function's ``__name__`` otherwise.

    The default values of the function's parameters are inspected to validate
    output names, collect annotation classes and record config usage. Finally
    the annotator is stored in ``modules[module_name].functions``; on success
    the "module_name" and "name" keys are deleted from ``annotator``.

    Args:
        annotator: Dict describing the annotator (see keys above).

    Raises:
        ValueError: If an output annotation (or its attribute) does not start
            with "<module_name>.", or an output model's first path component
            does not start with the module name.
    """
    module_name = annotator["module_name"]
    f_name = annotator[
        "function"].__name__ if not annotator["name"] else annotator["name"]
    rule_name = f"{module_name}:{f_name}"

    if annotator["language"]:
        # Add to set of supported languages...
        languages.update(annotator["language"])

        # ... but skip annotators for other languages than the one specified in the config
        # (the languages set above has already been updated at this point)
        if sparv_config.get("metadata.language") and sparv_config.get(
                "metadata.language") not in annotator["language"]:
            return

    # Add config variables to config
    if annotator["config"]:
        for c in annotator["config"]:
            handle_config(c, module_name, rule_name)

    # Handle document annotation for selected importer
    if annotator[
            "type"] == Annotator.importer and rule_name == sparv_config.get(
                "import.importer"):
        # Only applied when the config does not already define "classes.text"
        if annotator["document_annotation"] and not sparv_config.get(
                "classes.text"):
            sparv_config.set_value("import.document_annotation",
                                   annotator["document_annotation"])
            sparv_config.handle_document_annotation()

    # Inspect the default value of every parameter of the annotator function
    for param, val in inspect.signature(
            annotator["function"]).parameters.items():
        if isinstance(val.default, BaseOutput):
            ann = val.default
            cls = val.default.cls
            # presumably split() yields (annotation name, attribute) — verify against BaseOutput
            ann_name, attr = ann.split()

            # Make sure annotation names include module names as prefix
            if not attr:
                if not ann_name.startswith(module_name + "."):
                    raise ValueError(
                        "Output annotation '{}' in module '{}' doesn't include module "
                        "name as prefix.".format(ann_name, module_name))
            else:
                if not attr.startswith(module_name + "."):
                    raise ValueError(
                        "Output annotation '{}' in module '{}' doesn't include module "
                        "name as prefix in attribute.".format(
                            ann, module_name))

            # Add to class registry
            if cls:
                # Pick the class target based on where (or whether) ":" occurs in
                # the class name; a malformed name is reported and skipped.
                cls_target = None
                if ":" in cls and not cls.startswith(
                        ":") and ann_name and attr:
                    cls_target = ann.name
                elif cls.startswith(":") and attr:
                    cls_target = attr
                elif ":" not in cls:
                    cls_target = ann_name
                else:
                    print("Malformed class name: '{}'".format(cls))

                if cls_target:
                    # NOTE(review): with the nesting as laid out here, the outer
                    # check already guarantees a truthy language, so the
                    # all_module_classes[None] branch below looks unreachable —
                    # confirm the intended nesting against the original file.
                    if annotator["language"]:
                        if not annotator["language"]:
                            all_module_classes[None][cls].append(cls_target)
                        else:
                            for language in annotator["language"]:
                                all_module_classes[language][cls].append(
                                    cls_target)

                    # Only add classes for relevant languages
                    if not annotator["language"] or (
                            annotator["language"]
                            and sparv_config.get("metadata.language")
                            in annotator["language"]):
                        annotation_classes["module_classes"][cls].append(
                            cls_target)

        elif isinstance(val.default, ModelOutput):
            # The first path component of a model output must start with the module name
            modeldir = val.default.name.split("/")[0]
            if not modeldir.startswith(module_name):
                raise ValueError(
                    "Output model '{}' in module '{}' doesn't include module "
                    "name as sub directory.".format(val.default, module_name))

        elif isinstance(val.default, Config):
            # Record that this rule uses the config variable
            sparv_config.add_config_usage(val.default.name, rule_name)

        elif isinstance(val.default,
                        (ExportAnnotations, ExportAnnotationsAllDocs)):
            # Record the config usage and remember the config key as an annotation source
            sparv_config.add_config_usage(val.default.config_name, rule_name)
            annotation_sources.add(val.default.config_name)

    if module_name not in modules:
        modules[module_name] = Module(module_name)
    if f_name in modules[module_name].functions:
        # A name collision is only reported; the already registered function is kept
        print(
            "Annotator function '{}' collides with other function with same name in module '{}'."
            .format(f_name, module_name))
    else:
        # These two keys are removed from the stored dict; they are already
        # captured by the module and function names used as registry keys.
        del annotator["module_name"]
        del annotator["name"]
        modules[module_name].functions[f_name] = annotator
def edit_config(self,
                selected_annotations,
                annotator_max_len,
                show_optional: bool = False):
    """Ask the user for required config variables.

    Each pass of the outer loop reloads the annotators, collects every
    annotator needed (directly or through rule inputs) by the selected
    annotations, and prompts for all config variables those annotators report
    as missing. Missing variables whose name starts with "<" are skipped. The
    pass is repeated until nothing is missing.

    When ``show_optional`` is false the method returns at that point.
    Otherwise a menu of the used annotators is shown, from which the user can
    pick an annotator and edit its config variables until DONE is chosen.

    Args:
        selected_annotations: Nested mapping indexed as
            ``selected_annotations[module][f_name]``; only entries with a
            truthy value are treated as selected.
        annotator_max_len: Base width used to pad the annotator name column
            in the annotator menu.
        show_optional: Whether to offer editing of config variables once all
            required ones have been set.
    """
    def get_dependencies(module, f):
        """Recursively get all annotators needed by an annotator."""
        # Already visited: entries are created below with {} (which is not None)
        if used_annotators[module].get(f) is not None:
            return
        used_annotators[module].setdefault(f, {})
        # Follow each rule input back to the annotators that produce it
        for input_file in self.snake_storage.all_annotators[module][f][
                "rule"].inputs:
            if input_file in self.output_to_annotators:
                for annotator in self.output_to_annotators[input_file]:
                    get_dependencies(*annotator)

    # Last annotator chosen in the menu; used to preselect it on the next pass
    config_annotator = None

    while True:
        # We need to reload annotators in case any configuration has changed
        config.update_config(self.corpus_config)
        self.update_annotators()

        # Find all dependencies for the selected annotations
        used_annotators = defaultdict(dict)
        for module in selected_annotations:
            for f_name in selected_annotations[module]:
                if selected_annotations[module][f_name]:
                    get_dependencies(module, f_name)

        missing_configs = False

        # Check for any config variables that MUST be set (i.e. they have no default values we can use)
        for module in used_annotators:
            for f_name in used_annotators[module]:
                missing_config = self.snake_storage.all_annotators[module][
                    f_name]["rule"].missing_config
                # Names starting with "<" are ignored here and never prompted for
                if any(cfg for cfg in missing_config
                       if not cfg.startswith("<")):
                    missing_configs = True
                    config_values = self.q([{
                        "type": "text",
                        "name": config_key,
                        "message":
                        "The following config variable needs to be set.\n"
                        "Description: {}\n{}:".format(
                            config.get_config_description(config_key),
                            config_key)
                    } for config_key in missing_config
                                            if not config_key.startswith("<")
                                            ],
                                           clear=True)

                    # Store each answer both in the active config and in the corpus config
                    for key, value in config_values.items():
                        config.set_value(key, value)
                        config.set_value(key,
                                         value,
                                         config_dict=self.corpus_config)

        # New values may change which annotators are needed, so start over
        if missing_configs:
            continue

        if not show_optional:
            return

        # Build the annotator menu, one entry per used annotator
        config_annotators = []
        preselected = None
        for module in sorted(used_annotators):
            for a in sorted(used_annotators[module]):
                # Entry text: "module:function", the number of selected
                # annotations in parentheses (if any), and the rule description.
                # Annotators whose rule has no configs are marked as disabled.
                config_annotators.append({
                    "name":
                    "{:{width}} {} {}".format(
                        "{}:{}".format(module, a),
                        "({})".format(
                            len(selected_annotations[module].get(a, [])))
                        if selected_annotations[module].get(a) else " ",
                        self.snake_storage.all_annotators[module][a]
                        ["rule"].description,
                        width=annotator_max_len +
                        (0 if not self.snake_storage.all_annotators[module]
                         [a]["rule"].configs else 2)),
                    "value": (module, a),
                    "short":
                    "{}:{}".format(module, a),
                    "disabled":
                    not self.snake_storage.all_annotators[module][a]
                    ["rule"].configs
                })
                if config_annotator == (module, a):
                    preselected = config_annotators[-1]

        # "default" is only passed when there is a previous choice to preselect
        config_annotator = self.q(dict(
            {
                "type": "select",
                "name": "annotator",
                "message":
                "The following annotators will be used for your corpus, either directly or indirectly by the"
                " annotators you selected. You may edit their config variables if you wish.",
                "choices": [{
                    "name": DONE,
                    "value": "_done"
                }] + config_annotators
            }, **{"default": preselected} if preselected else {}),
                                  clear=True)["annotator"]

        if config_annotator == "_done":
            break
        else:
            module_name, f_name = config_annotator
            # Width of the config name column in the variable menu
            max_cfg_len = max(
                len(cfg) for cfg in
                self.snake_storage.all_annotators[module_name]
                [f_name]["rule"].configs)
            config_choice = None
            preselected_key = None
            # Inner menu: edit config variables of the chosen annotator until DONE
            while True:
                config_choices = []
                for cfg in self.snake_storage.all_annotators[module_name][
                        f_name]["rule"].configs:
                    config_choices.append({
                        "name":
                        "{:{width}} {}".format(
                            cfg,
                            config.get_config_description(cfg),
                            width=max_cfg_len),
                        "value":
                        cfg
                    })
                    if config_choice == cfg:
                        preselected_key = config_choices[-1]

                config_choice = self.q(dict(
                    {
                        "type": "select",
                        "name": "config",
                        "message":
                        "What configuration variable do you want to edit?",
                        "choices": [{
                            "name": DONE,
                            "value": "_done"
                        }] + config_choices
                    }, **{"default": preselected_key}
                    if preselected_key else {}),
                                       clear=True)["config"]

                if config_choice == "_done":
                    break
                else:
                    # The current value (or "" if unset/falsy) is offered as default
                    config_value = self.q([{
                        "type": "text",
                        "name": "value",
                        "default": config.get(config_choice) or "",
                        "message":
                        "Set value of config variable '{}':".format(
                            config_choice)
                    }])["value"]

                    # Only save value if changed
                    if config_value != (config.get(config_choice) or ""):
                        config.set_value(config_choice, config_value)
                        config.set_value(config_choice,
                                         config_value,
                                         config_dict=self.corpus_config)
def run(self):
    """Run the Sparv corpus set-up wizard.

    Steps, in order:
      1. Load the config and, with the corpus language temporarily unset,
         load all available annotators.
      2. Index the registered wizards by the first segment of the config
         variables they handle, and run the prerequisites check.
      3. Ask the "metadata" and "import" wizard questions, then add the
         classes registered for the chosen language and rebuild the
         annotator list.
      4. Offer source scanning, choose the document annotation and ask the
         "export" wizard questions.
      5. Select annotations, classes and wildcards, store the selection in
         the config and ask for required config variables.
      6. Offer a menu for further customization until the user chooses DONE,
         then save the config.
    """
    # Load default config and any existing corpus config
    self.load_config()

    # Temporarily unset corpus language to allow all modules to be loaded.
    # The language is restored even if loading the annotators fails, so the
    # config is never left with the language cleared.
    language = config.get("metadata.language")
    config.set_value("metadata.language", None)
    try:
        # Load all available annotators
        self.update_annotators()
    finally:
        # Restore language
        config.set_value("metadata.language", language)

    # Build module wizard index, keyed by the first segment of each config
    # variable name (e.g. "metadata", "import", "export")
    wizard_from_module = defaultdict(set)
    for w in registry.wizards:
        for config_variable in w[1]:
            wizard_from_module[config_variable.split(".")[0]].add(w)

    def ask_module_wizards(section, **q_kwargs):
        """Ask all wizard questions registered for a section and store the answers."""
        questions = []
        for w in wizard_from_module[section]:
            questions.extend(self.get_module_wizard(w))
        self.update_config(self.q(questions, **q_kwargs))

    # Initial question to check prerequisites
    self.prerequisites()

    # Start with metadata questions
    ask_module_wizards("metadata", clear=True)

    # Importer choice
    ask_module_wizards("import")

    # Now that the user has selected a language, update the class dict in registry...
    language_classes = registry.all_module_classes[config.get("metadata.language")]
    for cls, targets in language_classes.items():
        registry.annotation_classes["module_classes"][cls].extend(targets)

    # ...and rebuild annotator list
    self.update_annotators()

    # Ask user if they want to scan source files
    self.scan_source()

    # Choose document annotation
    self.select_document_annotation()

    # Select source annotations to keep
    ask_module_wizards("export")

    # Parse annotations from existing config
    selected_annotations = defaultdict(dict)
    self.parse_config_annotations(selected_annotations)

    # Select annotations
    annotator_max_len = self.select_annotations(selected_annotations)

    # Select classes if needed
    has_class_choices = self.select_classes(selected_annotations)

    # Select wildcards if needed
    has_wildcard_choices = self.select_wildcards(selected_annotations)

    # Add selected annotations to config
    self.update_export_annotations(selected_annotations)

    # Set config variables
    self.edit_config(selected_annotations, annotator_max_len)

    # We're done collecting the required data. Let the user edit further if they want to.
    while True:
        # The class and wildcard entries are only offered when there is something to edit
        choices = [{"name": DONE, "value": "done"}]
        if has_class_choices:
            choices.append({"name": "Edit class choices", "value": "class"})
        if has_wildcard_choices:
            choices.append({"name": "Edit wildcard references", "value": "wildcard"})
        choices.append({"name": "Edit annotator configurations", "value": "config"})

        choice = self.q(
            {
                "name": "choice",
                "type": "select",
                "choices": choices,
                "message": "All the necessary data has been collected, but you may do further customization by "
                           "selecting one of the options below."
            },
            clear=True)["choice"]

        if choice == "done":
            break
        elif choice == "class":
            has_class_choices = self.select_classes(selected_annotations, always_ask=True)
        elif choice == "wildcard":
            has_wildcard_choices = self.select_wildcards(selected_annotations, always_ask=True)
            # Wildcard values are part of the exported annotation names
            self.update_export_annotations(selected_annotations)
        elif choice == "config":
            self.edit_config(selected_annotations, annotator_max_len, show_optional=True)

    self.save_config()