예제 #1
0
 def update_config(self, answers: dict):
     """Store wizard answers in the corpus config and refresh the full config.

     Args:
         answers: Mapping from config key to the value to set for that key.
     """
     for key, value in answers.items():
         config.set_value(key, value, config_dict=self.corpus_config)
     # Push the modified corpus config into the full configuration
     config.update_config(self.corpus_config)
예제 #2
0
 def update_export_annotations(self, selected_annotations):
     """Add selected annotations to corpus config.

     Every selected annotation is collected into a flat list which is stored
     under the "export.annotations" key of the corpus config. Annotations
     carrying a "wildcards" mapping get each ``{name}`` placeholder replaced
     by the corresponding value first.

     Args:
         selected_annotations: Nested mapping ``module -> function name ->
             iterable of annotation dicts``. Each annotation dict has an
             "annotation" string and optionally a "wildcards" mapping from
             wildcard name to replacement string.
     """
     annotations = []
     for functions in selected_annotations.values():
         for function_annotations in functions.values():
             for ann in function_annotations:
                 value = ann["annotation"]
                 if "wildcards" in ann:
                     # Replace wildcards with values. This is a literal
                     # substitution: using a regex here would misinterpret
                     # metacharacters in the wildcard name and backslashes
                     # in the replacement value.
                     for wc, wcval in ann["wildcards"].items():
                         value = value.replace(f"{{{wc}}}", wcval)
                 annotations.append(value)
     config.set_value("export.annotations",
                      annotations,
                      config_dict=self.corpus_config)
예제 #3
0
def load_config(snakemake_config):
    """Read the corpus config file, if present, and apply any language override.

    Args:
        snakemake_config: Mapping of Snakemake config values. A truthy
            "language" entry replaces the "metadata.language" config value.

    Returns:
        True if no corpus config file exists in the current working
        directory, otherwise False.
    """
    config_path = Path.cwd() / paths.config_file
    config_missing = not config_path.is_file()

    if not config_missing:
        sparv_config.load_config(config_path)
        # Expose the classes declared in the config through the registry
        config_classes = sparv_config.config.get("classes", {})
        registry.annotation_classes["config_classes"] = config_classes

    # Some commands may override the corpus language
    language_override = snakemake_config.get("language")
    if language_override:
        sparv_config.set_value("metadata.language", language_override)

    return config_missing
예제 #4
0
def _add_to_registry(annotator):
    """Add function to annotator registry. Used by annotator.

    Besides storing the annotator under its module, this registers the
    annotator's languages, its config variables, the classes of its output
    annotations and the config keys it uses, based on the default values of
    the annotator function's parameters.

    Args:
        annotator: Dict describing the annotator. The keys read here are
            "module_name", "name", "function", "language", "config", "type"
            and "document_annotation". "module_name" and "name" are deleted
            from the dict when the annotator is stored.

    Raises:
        ValueError: If an output annotation (or its attribute) is not
            prefixed with the module name, or if an output model is not
            placed in a directory starting with the module name.
    """
    module_name = annotator["module_name"]
    # An explicitly given name takes precedence over the function's own name
    f_name = annotator["name"] or annotator["function"].__name__
    rule_name = f"{module_name}:{f_name}"
    annotator_languages = annotator["language"]

    if annotator_languages:
        # Add to set of supported languages...
        languages.update(annotator_languages)
        # ... but skip annotators for other languages than the one specified in the config
        corpus_language = sparv_config.get("metadata.language")
        if corpus_language and corpus_language not in annotator_languages:
            return

    # Add config variables to config
    if annotator["config"]:
        for c in annotator["config"]:
            handle_config(c, module_name, rule_name)

    # Handle document annotation for selected importer
    is_selected_importer = (
        annotator["type"] == Annotator.importer
        and rule_name == sparv_config.get("import.importer"))
    if is_selected_importer:
        if annotator["document_annotation"] and not sparv_config.get(
                "classes.text"):
            sparv_config.set_value("import.document_annotation",
                                   annotator["document_annotation"])
            sparv_config.handle_document_annotation()

    # Inspect parameter defaults; only the default values matter, not the names
    for val in inspect.signature(annotator["function"]).parameters.values():
        if isinstance(val.default, BaseOutput):
            ann = val.default
            cls = val.default.cls
            ann_name, attr = ann.split()

            # Make sure annotation names include module names as prefix
            if not attr:
                if not ann_name.startswith(module_name + "."):
                    raise ValueError(
                        "Output annotation '{}' in module '{}' doesn't include module "
                        "name as prefix.".format(ann_name, module_name))
            elif not attr.startswith(module_name + "."):
                raise ValueError(
                    "Output annotation '{}' in module '{}' doesn't include module "
                    "name as prefix in attribute.".format(ann, module_name))

            # Add to class registry
            if cls:
                cls_target = None
                if ":" in cls and not cls.startswith(
                        ":") and ann_name and attr:
                    cls_target = ann.name
                elif cls.startswith(":") and attr:
                    cls_target = attr
                elif ":" not in cls:
                    cls_target = ann_name
                else:
                    print("Malformed class name: '{}'".format(cls))

                if cls_target:
                    # Record the class for every language the annotator supports.
                    # NOTE(review): the original code also had a branch adding
                    # language-independent annotators to all_module_classes[None],
                    # but it was nested inside the truthiness check above and
                    # could never run. It is removed here without changing
                    # behaviour - confirm whether registering under None was
                    # actually intended.
                    if annotator_languages:
                        for language in annotator_languages:
                            all_module_classes[language][cls].append(
                                cls_target)

                    # Only add classes for relevant languages
                    if not annotator_languages or sparv_config.get(
                            "metadata.language") in annotator_languages:
                        annotation_classes["module_classes"][cls].append(
                            cls_target)

        elif isinstance(val.default, ModelOutput):
            # The first path component of the model name must start with the module name
            modeldir = val.default.name.split("/")[0]
            if not modeldir.startswith(module_name):
                raise ValueError(
                    "Output model '{}' in module '{}' doesn't include module "
                    "name as sub directory.".format(val.default, module_name))
        elif isinstance(val.default, Config):
            sparv_config.add_config_usage(val.default.name, rule_name)
        elif isinstance(val.default,
                        (ExportAnnotations, ExportAnnotationsAllDocs)):
            sparv_config.add_config_usage(val.default.config_name, rule_name)
            annotation_sources.add(val.default.config_name)

    if module_name not in modules:
        modules[module_name] = Module(module_name)
    if f_name in modules[module_name].functions:
        # A name collision is reported but not fatal; the first registration wins
        print(
            "Annotator function '{}' collides with other function with same name in module '{}'."
            .format(f_name, module_name))
    else:
        # These two values are now encoded in the registry keys
        del annotator["module_name"]
        del annotator["name"]
        modules[module_name].functions[f_name] = annotator
예제 #5
0
    def edit_config(self,
                    selected_annotations,
                    annotator_max_len,
                    show_optional: bool = False):
        """Ask the user for required config variables, and optionally for optional ones.

        The method loops until no annotator needed by the selected annotations
        reports a missing config variable. Each answer is written both to the
        global config and to self.corpus_config. When show_optional is true,
        the user is then offered a menu of all annotators in use and may edit
        the config variables of any of them until choosing DONE.

        Args:
            selected_annotations: Nested mapping ``module -> function name ->
                selected annotations``; only entries with a truthy value are
                used as starting points for the dependency search.
            annotator_max_len: Column width used to pad the "module:function"
                label in the annotator menu.
            show_optional: If False, return as soon as all required config
                variables have been set. If True, continue with the menu for
                editing config variables of the annotators in use.
        """
        def get_dependencies(module, f):
            """Recursively get all annotators needed by an annotator."""
            # Already visited: stop here to avoid endless recursion
            if used_annotators[module].get(f) is not None:
                return
            used_annotators[module].setdefault(f, {})
            # Follow every input of the rule to the annotators that produce it
            for input_file in self.snake_storage.all_annotators[module][f][
                    "rule"].inputs:
                if input_file in self.output_to_annotators:
                    for annotator in self.output_to_annotators[input_file]:
                        get_dependencies(*annotator)

        # Last annotator chosen in the menu; used to preselect it on the next round
        config_annotator = None

        while True:
            # We need to reload annotators in case any configuration has changed
            config.update_config(self.corpus_config)
            self.update_annotators()

            # Find all dependencies for the selected annotations
            used_annotators = defaultdict(dict)
            for module in selected_annotations:
                for f_name in selected_annotations[module]:
                    if selected_annotations[module][f_name]:
                        get_dependencies(module, f_name)

            missing_configs = False

            # Check for any config variables that MUST be set (i.e. they have no default values we can use)
            for module in used_annotators:
                for f_name in used_annotators[module]:
                    missing_config = self.snake_storage.all_annotators[module][
                        f_name]["rule"].missing_config
                    # Entries starting with "<" are never asked for
                    # NOTE(review): presumably these are placeholders rather than real config keys - confirm
                    if any(cfg for cfg in missing_config
                           if not cfg.startswith("<")):
                        missing_configs = True
                        # One text prompt per missing config variable
                        config_values = self.q([{
                            "type":
                            "text",
                            "name":
                            config_key,
                            "message":
                            "The following config variable needs to be set.\n"
                            "Description: {}\n{}:".format(
                                config.get_config_description(config_key),
                                config_key)
                        } for config_key in missing_config if
                                                not config_key.startswith("<")
                                                ],
                                               clear=True)

                        # Store each answer in both the global config and the corpus config
                        for key, value in config_values.items():
                            config.set_value(key, value)
                            config.set_value(key,
                                             value,
                                             config_dict=self.corpus_config)
            # Something was set: start over, reloading the annotators with the new values
            if missing_configs:
                continue

            if not show_optional:
                return

            # Build the menu of annotators in use, sorted by module and function name
            config_annotators = []
            preselected = None
            for module in sorted(used_annotators):
                for a in sorted(used_annotators[module]):
                    # Label: "module:function", the number of selected annotations in
                    # parentheses (blank if none) and the rule description. Annotators
                    # without config variables are shown as disabled.
                    config_annotators.append({
                        "name":
                        "{:{width}} {}  {}".format(
                            "{}:{}".format(module, a),
                            "({})".format(
                                len(selected_annotations[module].get(a, [])))
                            if selected_annotations[module].get(a) else "   ",
                            self.snake_storage.all_annotators[module][a]
                            ["rule"].description,
                            width=annotator_max_len +
                            (0 if not self.snake_storage.all_annotators[module]
                             [a]["rule"].configs else 2)),
                        "value": (module, a),
                        "short":
                        "{}:{}".format(module, a),
                        "disabled":
                        not self.snake_storage.all_annotators[module][a]
                        ["rule"].configs
                    })
                    # Remember the previously chosen annotator so it can be the default
                    if config_annotator == (module, a):
                        preselected = config_annotators[-1]

            # The "default" key is only passed when there is something to preselect
            config_annotator = self.q(dict(
                {
                    "type":
                    "select",
                    "name":
                    "annotator",
                    "message":
                    "The following annotators will be used for your corpus, either directly or indirectly by the"
                    " annotators you selected. You may edit their config variables if you wish.",
                    "choices": [{
                        "name": DONE,
                        "value": "_done"
                    }] + config_annotators
                }, **{"default": preselected} if preselected else {}),
                                      clear=True)["annotator"]

            if config_annotator == "_done":
                break
            else:
                module_name, f_name = config_annotator
                # Width of the longest config key, used to align the descriptions
                max_cfg_len = max(
                    len(cfg)
                    for cfg in self.snake_storage.all_annotators[module_name]
                    [f_name]["rule"].configs)
                config_choice = None
                preselected_key = None
                # Let the user edit config variables of this annotator until DONE is chosen
                while True:
                    config_choices = []
                    for cfg in self.snake_storage.all_annotators[module_name][
                            f_name]["rule"].configs:
                        config_choices.append({
                            "name":
                            "{:{width}}  {}".format(
                                cfg,
                                config.get_config_description(cfg),
                                width=max_cfg_len),
                            "value":
                            cfg
                        })
                        # Preselect the variable that was edited last
                        if config_choice == cfg:
                            preselected_key = config_choices[-1]

                    config_choice = self.q(dict(
                        {
                            "type":
                            "select",
                            "name":
                            "config",
                            "message":
                            "What configuration variable do you want to edit?",
                            "choices": [{
                                "name": DONE,
                                "value": "_done"
                            }] + config_choices
                        }, **{"default": preselected_key}
                        if preselected_key else {}),
                                           clear=True)["config"]

                    if config_choice == "_done":
                        break
                    else:
                        # Prompt for a new value, pre-filled with the current one (or empty)
                        config_value = self.q([{
                            "type":
                            "text",
                            "name":
                            "value",
                            "default":
                            config.get(config_choice) or "",
                            "message":
                            "Set value of config variable '{}':".format(
                                config_choice)
                        }])["value"]

                        # Only save value if changed
                        if config_value != (config.get(config_choice) or ""):
                            config.set_value(config_choice, config_value)
                            config.set_value(config_choice,
                                             config_value,
                                             config_dict=self.corpus_config)
예제 #6
0
    def run(self):
        """Run the Sparv corpus set-up wizard.

        Loads the configuration and all annotators, asks the module wizard
        questions for the "metadata", "import" and "export" modules, lets the
        user select annotations, classes and wildcards, collects required
        config variables and finally offers a menu for further customization
        before the config is saved.
        """
        # Load default config and any existing corpus config
        self.load_config()

        # Temporarily unset corpus language to allow all modules to be loaded
        language = config.get("metadata.language")
        config.set_value("metadata.language", None)
        try:
            # Load all available annotators
            self.update_annotators()
        finally:
            # Restore language, even if loading the annotators failed, so the
            # config is never left with the language unset
            config.set_value("metadata.language", language)

        # Build module wizard index: config variable prefix -> wizards
        wizard_from_module = defaultdict(set)
        for w in registry.wizards:
            for config_variable in w[1]:
                wizard_from_module[config_variable.split(".")[0]].add(w)

        def ask_module_questions(module_name, **q_args):
            """Ask all wizard questions of a module and store the answers in the config."""
            questions = []
            for wizard in wizard_from_module[module_name]:
                questions.extend(self.get_module_wizard(wizard))
            self.update_config(self.q(questions, **q_args))

        # Initial question to check prerequisites
        self.prerequisites()

        # Start with metadata questions
        ask_module_questions("metadata", clear=True)

        # Importer choice
        ask_module_questions("import")

        # Now that the user has selected a language, update the class dict in registry...
        for cls, targets in registry.all_module_classes[config.get(
                "metadata.language")].items():
            registry.annotation_classes["module_classes"][cls].extend(targets)

        # ...and rebuild annotator list
        self.update_annotators()

        # Ask user if they want to scan source files
        self.scan_source()

        # Choose document annotation
        self.select_document_annotation()

        # Select source annotations to keep
        ask_module_questions("export")

        # Parse annotations from existing config
        selected_annotations = defaultdict(dict)
        self.parse_config_annotations(selected_annotations)

        # Select annotations
        annotator_max_len = self.select_annotations(selected_annotations)

        # Select classes if needed
        has_class_choices = self.select_classes(selected_annotations)

        # Select wildcards if needed
        has_wildcard_choices = self.select_wildcards(selected_annotations)

        # Add selected annotations to config
        self.update_export_annotations(selected_annotations)

        # Set config variables
        self.edit_config(selected_annotations, annotator_max_len)

        # We're done collecting the required data. Let the user edit further if they want to.
        while True:
            # The class and wildcard entries are only offered when there is something to edit
            choices = [{"name": DONE, "value": "done"}]

            if has_class_choices:
                choices.append({
                    "name": "Edit class choices",
                    "value": "class"
                })
            if has_wildcard_choices:
                choices.append({
                    "name": "Edit wildcard references",
                    "value": "wildcard"
                })
            choices.append({
                "name": "Edit annotator configurations",
                "value": "config"
            })

            choice = self.q(
                {
                    "name":
                    "choice",
                    "type":
                    "select",
                    "choices":
                    choices,
                    "message":
                    "All the necessary data has been collected, but you may do further customization by "
                    "selecting one of the options below."
                },
                clear=True)["choice"]

            if choice == "done":
                break
            elif choice == "class":
                has_class_choices = self.select_classes(selected_annotations,
                                                        always_ask=True)
            elif choice == "wildcard":
                has_wildcard_choices = self.select_wildcards(
                    selected_annotations, always_ask=True)
                # Wildcard values are baked into the exported annotation names
                self.update_export_annotations(selected_annotations)
            elif choice == "config":
                self.edit_config(selected_annotations,
                                 annotator_max_len,
                                 show_optional=True)

        self.save_config()