示例#1
0
def read_yaml(yaml_file):
    """Read a YAML file and return its contents.

    Args:
        yaml_file: Path to the YAML file to read.

    Returns:
        The parsed YAML data, or an empty dict if the file is empty.

    Raises:
        util.SparvErrorMessage: If the file cannot be parsed or does not exist.
    """
    try:
        # YAML is defined as UTF-8; don't rely on the platform default encoding
        with open(yaml_file, encoding="utf-8") as f:
            data = yaml.load(f, Loader=yaml.FullLoader)
    except yaml.YAMLError as e:
        # Catch the whole PyYAML error hierarchy (ScannerError, ParserError,
        # ComposerError, ...) so every malformed config produces a friendly
        # error message instead of a raw traceback.
        raise util.SparvErrorMessage(
            "An error occurred while reading the configuration file:\n" +
            str(e)) from e
    except FileNotFoundError:
        raise util.SparvErrorMessage(
            f"Could not find the config file '{yaml_file}'") from None

    return data or {}
示例#2
0
def validate_config(config_dict=None, structure=None, parent=""):
    """Make sure the corpus config doesn't contain invalid keys.

    Recursively walks config_dict (defaulting to the global config) and
    compares every key against the declared config structure, raising a
    SparvErrorMessage for the first unknown key encountered.
    """
    cfg = config_dict if config_dict else config
    known = structure if structure else config_structure
    for key, value in cfg.items():
        path = f"{parent}.{key}" if parent else key
        if key in known:
            # Descend into subsections; declared leaf options carry "_source"
            if not known[key].get("_source"):
                validate_config(value, known[key], path)
            continue
        if parent:
            module_name = parent.split(".", 1)[0]
            raise util.SparvErrorMessage(
                f"Unknown key in config file: '{path}'. The module '{module_name}' "
                f"doesn't have an option with that name.",
                module="sparv",
                function="config")
        raise util.SparvErrorMessage(
            f"Unknown key in config file: '{path}'. No module with that name found.",
            module="sparv",
            function="config")
示例#3
0
def handle_document_annotation():
    """Copy the import.document_annotation value to the text class."""
    doc_elem = get("import.document_annotation")
    text_class = get("classes.text")

    # If both classes.text and import.document_annotation are set they must agree
    if doc_elem and text_class and text_class != doc_elem:
        raise util.SparvErrorMessage(
            "The config keys 'classes.text' and 'import.document_annotation' can't have different values.",
            "sparv", "config")

    # Propagate import.document_annotation to classes.text when it is set
    if doc_elem:
        set_default("classes.text", doc_elem)
示例#4
0
def get_source_files(source_files) -> List[str]:
    """Return the list of available source files.

    A non-empty source_files argument is returned unchanged; otherwise the
    source directory is scanned for files matching the configured importer's
    file extension.
    """
    if source_files:
        return source_files

    if not sparv_config.get("import.importer"):
        raise util.SparvErrorMessage(
            "The config variable 'import.importer' must not be empty.",
            "sparv")
    try:
        importer_module, _, importer_function = sparv_config.get(
            "import.importer").partition(":")
        file_extension = registry.modules[importer_module].functions[
            importer_function]["file_extension"]
    except KeyError:
        raise util.SparvErrorMessage(
            "Could not find the importer '{}'. Make sure the 'import.importer' config value refers to an "
            "existing importer.".format(
                sparv_config.get("import.importer")), "sparv")
    # listfiles yields (filename, wildcard_values); extract the {file} wildcard
    pattern = Path(get_source_path(), "{file}." + file_extension)
    return [match[1][0] for match in snakemake.utils.listfiles(pattern)]
示例#5
0
def validate_module_config():
    """Make sure that modules don't try to access undeclared config keys."""
    for config_key, annotators in config_usage.items():
        try:
            _get(config_key, config_structure)
        except KeyError:
            # Key isn't declared anywhere: report every annotator that uses it
            plural = len(annotators) > 1
            raise util.SparvErrorMessage(
                "The annotator{} {} {} trying to access the config key '{}' which isn't declared anywhere."
                .format("s" if plural else "",
                        ", ".join(annotators),
                        "are" if plural else "is", config_key),
                "sparv", "config")
示例#6
0
def vrt_scrambled(
        doc: Document = Document(),
        out: Export = Export("vrt_scrambled/{doc}.vrt"),
        chunk: Annotation = Annotation("[cwb.scramble_on]"),
        chunk_order: Annotation = Annotation(
            "[cwb.scramble_on]:misc.number_random"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Resolve which annotations to export and their export names
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output."
            .format(chunk))
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list, export_names, doc=doc, split_overlaps=True)

    # Read token words and the randomized chunk order
    words = list(word.read())
    scramble_order = list(chunk_order.read())

    # Rearrange the chunks (and their open/close tags) according to the order
    scrambled_positions = util.scramble_spans(span_positions, chunk.name,
                                              scramble_order)

    # Serialize to vrt
    vrt_data = create_vrt(scrambled_positions, token.name, words,
                          token_attributes, annotation_dict, export_names)

    # Make sure the export directory exists, then write the result
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
示例#7
0
def install_json(jsonfile: ExportInput = ExportInput("[metadata.id].json"),
                 out: OutputCommonData = OutputCommonData(
                     "sbx_metadata.install_json_export_marker"),
                 export_path: str = Config("sbx_metadata.json_export_path"),
                 host: str = Config("sbx_metadata.json_export_host")):
    """Copy JSON metadata to remote host."""
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.json_export_host' not set! JSON export not installed."
        )
    # Install the file under its original name in the remote export directory
    target = os.path.join(export_path, Path(jsonfile).name)
    util.install_file(host, jsonfile, target)
    # Write an empty marker so the pipeline knows the installation succeeded
    out.write("")
def install_metashare(
        xmlfile: ExportInput = ExportInput("sbx_metadata/[metadata.id].xml"),
        out: OutputCommonData = OutputCommonData(
            "sbx_metadata.install_metashare_marker"),
        export_path: str = Config("sbx_metadata.metashare_path"),
        host: str = Config("sbx_metadata.metashare_host")):
    """Copy META-SHARE file to remote host."""
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.metashare_host' not set! META-SHARE export not installed."
        )
    # Install the file under its original name in the remote export directory
    target = os.path.join(export_path, Path(xmlfile).name)
    util.install_file(host, xmlfile, target)
    # Write an empty marker so the pipeline knows the installation succeeded
    out.write("")
示例#9
0
def replace_list(chunk: Annotation,
                 out: Output,
                 find: str = "",
                 sub: str = ""):
    """Find and replace annotations.

    Find string must match whole annotation.
    find and sub are whitespace separated lists of words to replace and their replacement.

    Args:
        chunk: Annotation to read values from.
        out: Output annotation to write the replaced values to.
        find: Whitespace-separated words to look for.
        sub: Whitespace-separated replacements, one per word in find.

    Raises:
        util.SparvErrorMessage: If find and sub contain a different number of words.
    """
    find = find.split()
    sub = sub.split()
    if len(find) != len(sub):
        raise util.SparvErrorMessage(
            "Find and sub must have the same number of words.")
    # zip already yields (find, sub) pairs; no need to unpack and re-pair them
    translate = dict(zip(find, sub))
    # Values without a replacement are passed through unchanged
    out.write((translate.get(val, val) for val in chunk.read()))
def make_standard_xml_export(xml_export, corpus_id: str):
    """Make license info object for standard XML export.

    Returns None when xml_export is unset, a metadata dict for the values
    'scrambled' and 'original', and raises for any other value.
    """
    if not xml_export:
        return None
    if xml_export not in ("scrambled", "original"):
        raise util.SparvErrorMessage(
            f"Invalid config value for sbx_metadata.xml_export: '{xml_export}'. "
            "Possible values: 'scrambled', 'original', False")

    item = {
        "licence": "CC-BY",
        "restriction": "attribution",
        "download": f"{MENINGSMANGDER_URL}/{corpus_id}.xml.bz2",
        "type": "corpus",
        "format": "XML"
    }
    if xml_export == "scrambled":
        item["info"] = "this file contains a scrambled version of the corpus"
    return item
示例#11
0
def load_config(config_file: Optional[str],
                config_dict: Optional[dict] = None) -> None:
    """Load both default config and corpus config and merge into one config structure.

    The result is stored in the module-level `config` variable; the raw corpus
    config is kept in the module-level `config_user` variable.

    Args:
        config_file: Path to corpus config file. If None, only the default config is read.
        config_dict: Get corpus config from dictionary instead of config file.

    Raises:
        util.SparvErrorMessage: If a root-level config section is neither a dict nor a list.
    """
    # Read default config
    if DEFAULT_CONFIG.is_file():
        default_config = read_yaml(DEFAULT_CONFIG)
    else:
        log.warning(
            "Default config file is missing: {}".format(DEFAULT_CONFIG))
        default_config = {}
    # Classes from the default config are needed later for preset handling
    default_classes = default_config.get("classes", {})

    if config_file:
        # Read corpus config
        global config_user
        config_user = read_yaml(config_file) or {}

        def handle_parents(cfg, current_dir="."):
            """Combine parent configs recursively.

            Args:
                cfg: Config dict, possibly containing a PARENT key listing parents.
                current_dir: Directory used to resolve relative parent paths.

            Returns:
                The config with all (transitive) parent configs merged in.
            """
            combined_parents = {}
            if cfg.get(PARENT):
                parents = cfg[PARENT]
                # A single parent may be given as a plain string
                if isinstance(parents, str):
                    parents = [parents]
                for parent in parents:
                    # Parent paths are resolved relative to the child config
                    parent_path = Path(current_dir, parent)
                    config_parent = read_yaml(parent_path)
                    # Parents may themselves have parents; resolve depth-first
                    config_parent = handle_parents(config_parent,
                                                   parent_path.parent)
                    # First argument wins in _merge_dicts, so parents listed
                    # later override those listed earlier
                    combined_parents = _merge_dicts(config_parent,
                                                    combined_parents)
                # The child config overrides everything inherited
                cfg = _merge_dicts(cfg, combined_parents)
            return cfg

        # If parent configs are specified, inherit their contents
        config_user = handle_parents(config_user)
    elif config_dict:
        config_user = config_dict
    else:
        config_user = {}
    user_classes = config_user.get("classes", {})

    # Merge default and corpus config and save to global config variable
    # NOTE(review): the deepcopy presumably protects config_user from being
    # mutated in place by _merge_dicts — confirm against _merge_dicts
    global config
    config = _merge_dicts(copy.deepcopy(config_user), default_config)

    # Set correct classes and annotations from presets
    apply_presets(user_classes, default_classes)

    # Copy import.document_annotation to classes.text (only for file-based configs)
    if config_file:
        handle_document_annotation()

    # Make sure that the root level only contains dictionaries or lists to save us a lot of headache
    for key in config:
        if key == PARENT:
            continue
        if not isinstance(config[key], (dict, list)):
            raise util.SparvErrorMessage(
                f"The config section '{key}' could not be parsed.",
                module="sparv",
                function="config")
示例#12
0
def preserved_format(
        doc: Document = Document(),
        text: Text = Text(),
        docid: AnnotationData = AnnotationData("<docid>"),
        out: Export = Export(
            "xml_preserved_format/[xml_export.filename_formatted]"),
        annotations: ExportAnnotations = ExportAnnotations(
            "xml_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "xml_export.source_annotations"),
        header_annotations: SourceAnnotations = SourceAnnotations(
            "xml_export.header_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        include_empty_attributes: bool = Config(
            "xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespaces and indentation from original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID
    corpus_text = text.read()
    # Rebinds the parameter: from here on docid is the ID value, not the annotation
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations,
                                                          doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list,
        export_names,
        h_annotations,
        doc=doc,
        flatten=False,
        split_overlaps=True)
    # Flatten to (position, open/close instruction, span) tuples sorted by text position
    sorted_positions = [(pos, span[0], span[1])
                        for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage(
            "Root tag is missing! If you have manually specified which elements to include, "
            "make sure to include an element that encloses all other included elements and "
            "text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    # Walk the sorted positions, building the XML tree and attaching the text
    # between spans as node text/tails so the original whitespace is preserved
    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [
                    i for i in span_positions[last_pos] if i[0] == "close"
                ][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][
                    span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict,
                                    export_names, span.index,
                                    include_empty_attributes)
                if span.overlap_id:
                    # Mark split overlapping elements so they can be matched up again
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(
                            f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                            f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                # (look ahead to the next position; x is the index into sorted_positions)
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [
                        i for i in span_positions[last_pos] if i[0] == "close"
                    ][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[
                -1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write xml to file
    etree.ElementTree(root_span.node).write(out,
                                            encoding="unicode",
                                            method="xml",
                                            xml_declaration=True)
    log.info("Exported: %s", out)
示例#13
0
    def __init__(self,
                 elements: list,
                 skip: list,
                 header_elements: list,
                 headers: list,
                 encoding: str = util.UTF8,
                 source_dir: str = "src",
                 prefix: str = "",
                 keep_control_chars: bool = True,
                 normalize: str = "NFC"):
        """Initialize XML parser.

        Args:
            elements: Elements and attributes ("elem" or "elem:attr") to parse,
                optionally with rename targets (see util.parse_annotation_list).
            skip: Elements/attributes to skip; must be disjoint from elements.
            header_elements: Elements to be treated as headers (stored for parsing).
            headers: Header bindings on the form "source as target".
            encoding: Encoding of the source files.
            source_dir: Directory containing the source files.
            prefix: Prefix stored on the parser (presumably applied to
                annotation names during parsing — confirm in parse code).
            keep_control_chars: Whether to keep control characters (stored for parsing).
            normalize: Unicode normalization form (stored for parsing).
        """
        self.source_dir = source_dir
        self.encoding = encoding
        self.keep_control_chars = keep_control_chars
        self.normalize = normalize
        self.doc = None
        self.prefix = prefix
        self.header_elements = header_elements
        self.headers = {}

        self.pos = 0  # Current position in the text data
        self.subpos = 0  # Sub-position for tags with same position
        self.tagstack = []
        self.targets = {
        }  # Index of elements and attributes that will be renamed during import
        self.data = {}  # Metadata collected during parsing
        self.text = []  # Text data of the document collected during parsing

        # Parse elements argument

        def elsplit(elem):
            """Split an element specification into (tag, attribute).

            Escaped colons (r"\:") are temporarily replaced with semicolons so
            that partition only splits on the real tag/attribute separator.
            """
            elem = elem.replace(r"\:", ";")
            tag, _, attr = elem.partition(":")
            tag = tag.replace(";", ":")
            attr = attr.replace(";", ":")
            return tag, attr

        all_elems = set()
        renames = {}
        # Element list needs to be sorted to handle plain elements before attributes
        for element, target in sorted(util.parse_annotation_list(elements)):
            element, attr = elsplit(element)
            all_elems.add((element, attr))

            if target:
                # Element and/or attribute should be renamed during import
                if not attr:
                    # Remember the element rename so later attribute entries
                    # attach to the renamed element
                    renames[element] = target
                    target_element = target
                    target_attr = ""
                else:
                    target_element = renames.get(element, element)
                    target_attr = target
                self.targets.setdefault(element, {"attrs": {}})
                self.targets[element]["target"] = target_element
                self.data.setdefault(target_element, {
                    "attrs": set(),
                    "elements": []
                })
                if target_attr:
                    self.targets[element]["attrs"][attr] = target_attr
                    self.data[target_element]["attrs"].add(target_attr)
            else:
                # No rename: register element (and attribute) under original names
                self.data.setdefault(element, {"attrs": set(), "elements": []})
                if attr:
                    self.data[element]["attrs"].add(attr)

        # Parse header bindings ("root/rest:attrib as target")
        for header in headers:
            header_source, _, header_target = header.partition(" as ")
            if not header_target:
                raise util.SparvErrorMessage(
                    "The header '{}' needs to be bound to a target element.".
                    format(header))
            # Split off the optional source attribute, then the root element
            header_source, _, header_source_attrib = header_source.partition(
                ":")
            header_source_root, _, header_source_rest = header_source.partition(
                "/")
            self.headers.setdefault(header_source_root, {})
            self.headers[header_source_root].setdefault(header_source_rest, [])
            self.headers[header_source_root][header_source_rest].append({
                "source":
                header_source_attrib,
                "target":
                elsplit(header_target)
            })

        self.skipped_elems = set(elsplit(elem) for elem in skip)
        assert self.skipped_elems.isdisjoint(
            all_elems), "skip and elements must be disjoint"
示例#14
0
def make_pretty_xml(span_positions,
                    annotation_dict,
                    export_names,
                    token_name: str,
                    word_annotation,
                    docid,
                    include_empty_attributes: bool,
                    sparv_namespace: Optional[str] = None):
    """Create a pretty formatted XML string from span_positions.

    Used by pretty and sentence_scrambled.

    Args:
        span_positions: Sequence of (position, open/close instruction, span)
            tuples; the first and last entries must form the root element.
        annotation_dict: Annotation values, indexed by annotation name.
        export_names: Mapping from annotation names to export names.
        token_name: Name of the token annotation.
        word_annotation: Token strings, indexed by token span index.
        docid: Document ID, used to label split overlapping elements.
        include_empty_attributes: Whether to include attributes that are empty.
        sparv_namespace: Optional namespace prefix for the overlap attribute.

    Returns:
        The XML document as a string, including an XML declaration.

    Raises:
        util.SparvErrorMessage: If the first/last spans don't form a valid root.
    """
    # Root tag sanity check
    if not valid_root(span_positions[0], span_positions[-1]):
        raise util.SparvErrorMessage(
            "Root tag is missing! If you have manually specified which elements to include, "
            "make sure to include an element that encloses all other included elements and "
            "text content.")

    # Create root node
    root_span = span_positions[0][2]
    root_span.set_node()
    add_attrs(root_span.node, root_span.name, annotation_dict, export_names, 0,
              include_empty_attributes)
    node_stack = [root_span]

    # State for distributing token text across elements nested inside a token
    last_start_pos = None
    last_end_pos = -1
    current_token_text = None
    last_node = None
    inside_token = False

    def handle_subtoken_text(position, last_start_position, last_end_position,
                             node, token_text):
        """Handle text for subtoken elements.

        Attaches the part of token_text preceding `position` either as the
        tail of `node` (if the last event was a close) or as its text
        (if it was an open), and returns the remaining token text.
        """
        if last_start_position < last_end_position < position:
            node.tail = token_text[:position - last_end_position]
            token_text = token_text[position - last_end_position:]
        elif position > last_start_position:
            node.text = token_text[:position - last_start_position]
            token_text = token_text[position - last_start_position:]
        return token_text

    # Go through span_positions and build xml tree
    # (skip index 0: the root node was already created above)
    for _pos, instruction, span in span_positions[1:]:
        # Handle headers
        if span.is_header:
            if instruction == "open":
                header = annotation_dict[span.name][util.HEADER_CONTENTS][
                    span.index]
                # Replace any leading tabs with spaces
                header = re.sub(r"^\t+",
                                lambda m: INDENTATION * len(m.group()),
                                header,
                                flags=re.MULTILINE)
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                node_stack[-1].node.append(header_xml)
            continue

        # Create child node under the top stack node
        if instruction == "open":
            span.set_node(parent_node=node_stack[-1].node)
            node_stack.append(span)
            add_attrs(span.node, span.name, annotation_dict, export_names,
                      span.index, include_empty_attributes)
            if span.overlap_id:
                # Mark split overlapping elements so they can be matched up again
                if sparv_namespace:
                    span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}",
                                  f"{docid}-{span.overlap_id}")
                else:
                    span.node.set(
                        f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                        f"{docid}-{span.overlap_id}")

            # Add text if this node is a token
            if span.name == token_name:
                inside_token = True
                # Save text until later
                last_start_pos = span.start
                current_token_text = word_annotation[span.index]

            # Elements opening inside a token get the pending token text
            if inside_token and current_token_text:
                current_token_text = handle_subtoken_text(
                    span.start, last_start_pos, last_end_pos, last_node,
                    current_token_text)
                last_start_pos = span.start
                last_node = span.node

        # Close node
        else:
            # Flush remaining token text before closing an element inside a token
            if inside_token and current_token_text:
                current_token_text = handle_subtoken_text(
                    span.end, last_start_pos, last_end_pos, last_node,
                    current_token_text)
                last_end_pos = span.end
                last_node = span.node
            if span.name == token_name:
                inside_token = False

            # Make sure closing node == top stack node
            assert span == node_stack[
                -1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Pretty formatting of XML tree
    indent(root_span.node)

    # We use write() instead of tostring() here to be able to get an XML declaration
    stream = io.StringIO()
    etree.ElementTree(root_span.node).write(stream,
                                            encoding="unicode",
                                            method="xml",
                                            xml_declaration=True)
    return stream.getvalue()
示例#15
0
def rule_helper(rule: RuleStorage,
                config: dict,
                storage: SnakeStorage,
                config_missing: bool = False,
                custom_rule_obj: Optional[dict] = None) -> bool:
    """
    Populate rule with Snakemake input, output and parameter list.

    Return True if a Snakemake rule should be created. Returning False means
    the rule is skipped entirely (wrong language, missing config, an unused
    custom rule, or an output parameter with no value).

    Args:
        rule: Object containing snakemake rule parameters.
        config: Dictionary containing the corpus configuration.
        storage: Object for saving information for all rules.
        config_missing: True if there is no corpus config file.
        custom_rule_obj: Custom annotation dictionary from corpus config.
    """
    # Only create certain rules when config is missing
    if config_missing and not rule.modelbuilder:
        return False

    # Skip any annotator that is not available for the selected corpus language
    if rule.annotator_info["language"] and sparv_config.get("metadata.language") and \
            sparv_config.get("metadata.language") not in rule.annotator_info["language"]:
        return False

    # Get this function's parameters
    params = OrderedDict(
        inspect.signature(rule.annotator_info["function"]).parameters)
    param_dict = make_param_dict(params)

    if rule.importer:
        rule.inputs.append(
            Path(get_source_path(), "{doc}." + rule.file_extension))
        storage.all_importers.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {
                "description": rule.description,
                "params": param_dict
            })
        if rule.target_name == sparv_config.get("import.importer"):
            # Exports always generate corpus text file
            rule.outputs.append(paths.work_dir / "{doc}" / io.TEXT_FILE)
            # If importer guarantees other outputs, add them to outputs list
            if rule.import_outputs:
                if isinstance(rule.import_outputs, Config):
                    rule.import_outputs = sparv_config.get(
                        rule.import_outputs, rule.import_outputs.default)
                annotations_ = set()
                renames = {}
                # Annotation list needs to be sorted to handle plain annotations before attributes
                for ann, target in sorted(
                        util.parse_annotation_list(rule.import_outputs)):
                    # Handle annotations renamed during import
                    if target:
                        source_ann, source_attr = BaseAnnotation(ann).split()
                        if not source_attr:
                            renames[ann] = target
                            ann = target
                        else:
                            # Attribute on a possibly-renamed element: rejoin
                            # using the renamed element name if one was recorded
                            ann = io.join_annotation(
                                renames.get(source_ann, source_ann), target)
                    annotations_.add(ann)

                for element in annotations_:
                    rule.outputs.append(paths.work_dir /
                                        get_annotation_path(element))

            # If import.document_annotation has been specified, add it to outputs if not already there
            if sparv_config.get("import.document_annotation"):
                doc_ann_file = paths.work_dir / get_annotation_path(
                    sparv_config.get("import.document_annotation"))
                if doc_ann_file not in rule.outputs:
                    rule.outputs.append(doc_ann_file)

    if rule.exporter:
        storage.all_exporters.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {
                "description": rule.description,
                "params": param_dict
            })
    elif rule.installer:
        storage.all_installers.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {
                "description": rule.description,
                "params": param_dict
            })

    output_dirs = set()  # Directories where export files are stored
    custom_params = set()  # Param names from the custom rule not yet matched

    if custom_rule_obj:
        if custom_rule_obj.get("params"):
            name_custom_rule(rule, storage)
            custom_params = set(custom_rule_obj.get("params").keys())
        else:
            # This rule has already been populated, so don't process it again
            return False

    # Go through function parameters and handle based on type
    for param_name, param in params.items():
        param_default_empty = param.default == inspect.Parameter.empty
        param_value: Any

        # Get parameter value, either from custom rule object or default value
        if custom_rule_obj:
            if param_name in custom_rule_obj["params"]:
                param_value = custom_rule_obj["params"][param_name]
                custom_params.remove(param_name)
            elif not param_default_empty:
                param_value = copy.deepcopy(param.default)
            else:
                raise util.SparvErrorMessage(
                    f"Parameter '{param_name}' in custom rule '{rule.full_name}' has no value!",
                    "sparv", "config")
        else:
            if param_default_empty:
                # This is probably an unused custom rule, so don't process it any further,
                # but save it in all_custom_annotators and all_annotators
                storage.all_custom_annotators.setdefault(
                    rule.module_name, {}).setdefault(rule.f_name, {
                        "description": rule.description,
                        "params": param_dict
                    })
                storage.custom_targets.append(
                    (rule.target_name, rule.description))
                storage.all_annotators.setdefault(
                    rule.module_name, {}).setdefault(
                        rule.f_name, {
                            "description": rule.description,
                            "annotations": [],
                            "params": param_dict
                        })
                return False
            else:
                param_value = copy.deepcopy(param.default)

        param_type, param_list, param_optional = registry.get_type_hint_type(
            param.annotation)

        # Output
        if issubclass(param_type, BaseOutput):
            if not isinstance(param_value, BaseOutput):
                if not param_value:
                    return False
                param_value = param_type(param_value)
            rule.configs.update(
                registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            missing_configs = param_value.expand_variables(rule.full_name)
            rule.missing_config.update(missing_configs)
            ann_path = get_annotation_path(param_value,
                                           data=param_type.data,
                                           common=param_type.common)
            if param_type.all_docs:
                rule.outputs.extend(
                    map(
                        Path,
                        expand(escape_wildcards(paths.work_dir / ann_path),
                               doc=get_source_files(storage.source_files))))
            elif param_type.common:
                rule.outputs.append(paths.work_dir / ann_path)
                if rule.installer:
                    storage.install_outputs[rule.target_name].append(
                        paths.work_dir / ann_path)
            else:
                rule.outputs.append(
                    get_annotation_path(param_value, data=param_type.data))
            rule.parameters[param_name] = param_value
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
            if rule.annotator:
                storage.all_annotators.setdefault(
                    rule.module_name, {}).setdefault(
                        rule.f_name, {
                            "description": rule.description,
                            "annotations": [],
                            "params": param_dict
                        })
                storage.all_annotators[rule.module_name][
                    rule.f_name]["annotations"].append(
                        (param_value, param_value.description))
        # ModelOutput
        elif param_type == ModelOutput:
            rule.configs.update(
                registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            rule.missing_config.update(
                param_value.expand_variables(rule.full_name))
            model_path = param_value.path
            rule.outputs.append(model_path)
            rule.parameters[param_name] = ModelOutput(str(model_path))
            storage.model_outputs.append(model_path)
        # Annotation
        elif issubclass(param_type, BaseAnnotation):
            if not isinstance(param_value, BaseAnnotation):
                if not param_value:
                    return False
                param_value = param_type(param_value)
            rule.configs.update(
                registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            missing_configs = param_value.expand_variables(rule.full_name)
            # Optional annotations with unresolved config are passed as None
            # instead of failing the whole rule
            if (not param_value or missing_configs) and param_optional:
                rule.parameters[param_name] = None
                continue
            rule.missing_config.update(missing_configs)
            ann_path = get_annotation_path(param_value,
                                           data=param_type.data,
                                           common=param_type.common)
            if param_type.all_docs:
                rule.inputs.extend(
                    expand(escape_wildcards(paths.work_dir / ann_path),
                           doc=get_source_files(storage.source_files)))
            elif rule.exporter or rule.installer or param_type.common:
                rule.inputs.append(paths.work_dir / ann_path)
            else:
                rule.inputs.append(ann_path)

            rule.parameters[param_name] = param_value
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
        # ExportAnnotations
        elif param_type in (ExportAnnotations, ExportAnnotationsAllDocs):
            if not isinstance(param_value, param_type):
                param_value = param_type(param_value)
            rule.parameters[param_name] = param_value

            source = param.default.config_name
            annotations = sparv_config.get(f"{source}", [])
            if not annotations:
                rule.missing_config.add(f"{source}")
            export_annotations = util.parse_annotation_list(
                annotations, add_plain_annotations=False)
            annotation_type = Annotation if param_type == ExportAnnotations else AnnotationAllDocs
            plain_annotations = set()
            possible_plain_annotations = []
            for i, (export_annotation_name,
                    export_name) in enumerate(export_annotations):
                annotation = annotation_type(export_annotation_name)
                rule.configs.update(
                    registry.find_config_variables(annotation.name))
                rule.classes.update(registry.find_classes(annotation.name))
                rule.missing_config.update(
                    annotation.expand_variables(rule.full_name))
                export_annotations[i] = (annotation, export_name)
                plain_name, attr = annotation.split()
                if not attr:
                    plain_annotations.add(plain_name)
                else:
                    if plain_name not in possible_plain_annotations:
                        possible_plain_annotations.append(plain_name)
            # Add plain annotations where needed
            for a in possible_plain_annotations:
                if a not in plain_annotations:
                    export_annotations.append((annotation_type(a), None))

            for annotation, export_name in export_annotations:
                if param.default.is_input:
                    if param_type == ExportAnnotationsAllDocs:
                        rule.inputs.extend(
                            expand(escape_wildcards(
                                paths.work_dir /
                                get_annotation_path(annotation.name)),
                                   doc=get_source_files(storage.source_files)))
                    else:
                        rule.inputs.append(
                            paths.work_dir /
                            get_annotation_path(annotation.name))
                rule.parameters[param_name].append((annotation, export_name))
        # SourceAnnotations
        elif param_type == SourceAnnotations:
            rule.parameters[param_name] = sparv_config.get(
                f"{param.default.config_name}", None)
        # Corpus
        elif param.annotation == Corpus:
            rule.parameters[param_name] = Corpus(
                sparv_config.get("metadata.id"))
        # Language
        elif param.annotation == Language:
            rule.parameters[param_name] = Language(
                sparv_config.get("metadata.language"))
        # Document
        elif param.annotation == Document:
            rule.docs.append(param_name)
        # AllDocuments (all source documents)
        elif param_type == AllDocuments:
            rule.parameters[param_name] = AllDocuments(
                get_source_files(storage.source_files))
        # Text
        elif param_type == Text:
            text_path = Path("{doc}") / io.TEXT_FILE
            if rule.exporter or rule.installer:
                rule.inputs.append(paths.work_dir / text_path)
            else:
                rule.inputs.append(text_path)
            rule.parameters[param_name] = param_value
        # Model
        elif param_type == Model:
            if param_value is not None:
                if param_list:
                    rule.parameters[param_name] = []
                    for model in param_value:
                        if not isinstance(model, Model):
                            # Bug fix: wrap the current element, not the whole
                            # list (previously Model(param_value))
                            model = Model(model)
                        rule.configs.update(
                            registry.find_config_variables(model.name))
                        rule.classes.update(registry.find_classes(model.name))
                        rule.missing_config.update(
                            model.expand_variables(rule.full_name))
                        rule.inputs.append(model.path)
                        rule.parameters[param_name].append(
                            Model(str(model.path)))
                else:
                    if not isinstance(param_value, Model):
                        param_value = Model(param_value)
                    rule.configs.update(
                        registry.find_config_variables(param_value.name))
                    rule.classes.update(registry.find_classes(
                        param_value.name))
                    rule.missing_config.update(
                        param_value.expand_variables(rule.full_name))
                    rule.inputs.append(param_value.path)
                    rule.parameters[param_name] = Model(str(param_value.path))
        # Binary
        elif param.annotation in (Binary, BinaryDir):
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(
                param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            binary = util.find_binary(param_value,
                                      executable=False,
                                      allow_dir=param.annotation == BinaryDir)
            if not binary:
                rule.missing_binaries.add(param_value)
            # Fall back to the unexpanded value so the rule still has an input
            binary = Path(binary if binary else param_value)
            rule.inputs.append(binary)
            rule.parameters[param_name] = param.annotation(binary)
        # Source
        elif param.annotation == Source:
            rule.parameters[param_name] = Source(get_source_path())
        # Export
        elif param.annotation == Export:
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(
                param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            if param.default.absolute_path:
                export_path = Path(param_value)
            else:
                export_path = paths.export_dir / param_value
            output_dirs.add(export_path.parent)
            rule.outputs.append(export_path)
            rule.parameters[param_name] = Export(str(export_path))
            if "{doc}" in rule.parameters[param_name]:
                rule.doc_annotations.append(param_name)
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
        # ExportInput
        elif param.annotation == ExportInput:
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(
                param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            if param.default.absolute_path:
                rule.parameters[param_name] = ExportInput(param_value)
            else:
                rule.parameters[param_name] = ExportInput(paths.export_dir /
                                                          param_value)
            if param.default.all_docs:
                rule.inputs.extend(
                    expand(escape_wildcards(rule.parameters[param_name]),
                           doc=get_source_files(storage.source_files)))
            else:
                rule.inputs.append(Path(rule.parameters[param_name]))
            if "{" in rule.parameters[param_name]:
                rule.wildcard_annotations.append(param_name)
        # Config
        elif isinstance(param_value, Config):
            rule.configs.add(param_value.name)
            config_value = sparv_config.get(param_value, sparv_config.Unset)
            if config_value is sparv_config.Unset:
                if param_value.default is not None:
                    config_value = param_value.default
                elif param_optional:
                    config_value = None
                else:
                    rule.missing_config.add(param_value)
            rule.parameters[param_name] = config_value
        # Everything else
        else:
            rule.parameters[param_name] = param_value

    # For custom rules, warn the user of any unknown parameters
    if custom_params:
        print_sparv_warning(
            "The parameter{} '{}' used in one of your custom rules "
            "do{} not exist in {}.".format(
                "s" if len(custom_params) > 1 else "",
                "', '".join(custom_params),
                "es" if len(custom_params) == 1 else "", rule.full_name))

    storage.all_rules.append(rule)

    # Add to rule lists in storage
    update_storage(storage, rule)

    # Add exporter dirs (used for informing user)
    if rule.exporter:
        if rule.abstract:
            output_dirs = set([p.parent for p in rule.inputs])
        # Trailing separator is produced via the "/_" append-then-strip trick
        rule.export_dirs = [str(p / "_")[:-1] for p in output_dirs]

    if rule.missing_config:
        # Class references are wrapped in angle brackets; everything else is a
        # plain config key
        log_handler.messages["missing_configs"][rule.full_name].update(
            [c for c in rule.missing_config if not c.startswith("<")])
        log_handler.messages["missing_classes"][rule.full_name].update(
            [c[1:-1] for c in rule.missing_config if c.startswith("<")])

    if rule.missing_binaries:
        log_handler.messages["missing_binaries"][rule.full_name].update(
            rule.missing_binaries)

    if config.get("debug"):
        print()
        console.print("[b]{}:[/b] {}".format(rule.module_name.upper(),
                                             rule.f_name))
        print()
        console.print("    [b]INPUTS[/b]")
        for i in rule.inputs:
            print("        {}".format(i))
        print()
        console.print("    [b]OUTPUTS[/b]")
        for o in rule.outputs:
            print("        {}".format(o))
        print()
        console.print("    [b]PARAMETERS[/b]")
        for p in rule.parameters:
            print("        {} = {!r}".format(p, rule.parameters[p]))
        print()
        print()

    return True