def uppercase(
    word: Annotation = Annotation("<token:word>"),
    out: Output = Output("<token>:uppercase.upper"),
    # some_config_variable: str = Config("uppercase.some_setting")
):
    """Write an upper-cased copy of every value in the word annotation.

    Each value read from ``word`` is passed through ``str.upper`` and the
    results are written to ``out`` as a list, in the order they were read.
    """
    uppercased = []
    for token in word.read():
        uppercased.append(token.upper())
    out.write(uppercased)
Пример #2
0
def diapivot_annotate(
        out: Output = Output(
            "<token>:hist.diapivot",
            description="SALDO IDs corresponding to lemgrams"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Every value of ``lemgram`` is split on ``util.DELIM``; each part is looked
    up with ``PivotLexicon.get_exactMatch`` and the truthy results are joined
    with ``util.DELIM`` and wrapped in ``util.AFFIX``. Values without any match
    become a single ``util.AFFIX``.

    Args:
        out: Resulting annotation.
            Defaults to Output("<token>:hist.diapivot", description="SALDO IDs corresponding to lemgrams").
        lemgram: Existing lemgram annotation. Defaults to Annotation("<token>:saldo.lemgram").
        model: Crosslink model. Defaults to Model("hist/diapivot.pickle").
    """
    lexicon = PivotLexicon(model)

    out_annotation = []
    for lemgrams in lemgram.read():
        matches = (lexicon.get_exactMatch(single_lemgram)
                   for single_lemgram in lemgrams.split(util.DELIM))
        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # written value is reproducible between runs (a plain set is not).
        saldo_ids = list(dict.fromkeys(match for match in matches if match))
        if saldo_ids:
            out_annotation.append(util.AFFIX + util.DELIM.join(saldo_ids) + util.AFFIX)
        else:
            out_annotation.append(util.AFFIX)

    out.write(out_annotation)
Пример #3
0
def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix",
                             description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Create LIX annotation for text.

    For every text, the (word, pos) pairs of each contained sentence are
    filtered through ``actual_words`` with ``skip_pos``; the resulting list of
    sentences is handed to ``lix_calc`` and the value is formatted with ``fmt``.
    """
    # Parent/child relations and token attributes
    sentences_per_text, _ = text.get_children(sentence)
    token_attrs = list(word.read_attributes((word, pos)))
    tokens_per_sentence, _ = sentence.get_children(word)
    tokens_per_sentence = list(tokens_per_sentence)

    def _words_in(sentence_index):
        """Return the filtered words of one sentence as a list."""
        members = [token_attrs[token_index]
                   for token_index in tokens_per_sentence[sentence_index]]
        return list(actual_words(members, skip_pos))

    # One formatted LIX value per text element
    out.write([
        fmt % lix_calc([_words_in(sentence_index) for sentence_index in sentence_indices])
        for sentence_indices in sentences_per_text
    ])
Пример #4
0
def translate_tag(out: Output, tag: Annotation, mapping: dict = {}):
    """Convert part-of-speech tags according to a mapping.

    ``mapping`` is either a dict or the name of a mapping in
    ``util.tagsets.mappings`` (e.g. parole_to_suc, suc_to_simple, ...).
    Tags that are missing from the mapping are written unchanged.
    """
    # A string is the name of a predefined mapping; anything else is used as is
    table = util.tagsets.mappings[mapping] if isinstance(mapping, str) else mapping

    out.write(map(lambda original: table.get(original, original), tag.read()))
Пример #5
0
def find_replace_regex(chunk: Annotation,
                       out: Output,
                       find: str = "",
                       sub: str = ""):
    """Do find and replace in the values of an annotation using a regular expression.

    Every value read from ``chunk`` is passed through ``re.sub(find, sub, value)``
    and the results are written to ``out``.

    N.B: When writing regular expressions in YAML they should be enclosed in single quotes.
    """
    def _substitute(value):
        return re.sub(find, sub, value)

    out.write(map(_substitute, chunk.read()))
Пример #6
0
def concat2(out: Output,
            annotations: List[Annotation] = [Annotation],
            separator: str = ""):
    """Concatenate two or more annotations, with an optional separator.

    The values at the same position in every annotation are joined with
    ``separator``; the first annotation determines how many values are written.
    """
    columns = []
    for annotation in annotations:
        columns.append(list(annotation.read()))

    joined = []
    for row in range(len(columns[0])):
        joined.append(separator.join([column[row] for column in columns]))
    out.write(joined)
Пример #7
0
def msdtag(out: Output = Output(
    "<token>:hunpos.msd",
    cls="token:msd",
    description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger.

    The words of every sentence are sent to the tagger binary on stdin, after
    words matching a regex from the patterns file have been replaced by
    ``[[name]]``. The tag column of the tagger output is written to ``out``,
    translated through ``tag_mapping`` when the tag has an entry there.

    ``tag_mapping`` may be a mapping name (looked up in
    ``util.tagsets.mappings``), None or "" (no translation), or any other
    object, which is used as is.
    """
    if tag_mapping is None or tag_mapping == "":
        tag_mapping = {}
    elif isinstance(tag_mapping, str):
        tag_mapping = util.tagsets.mappings[tag_mapping]

    # (alias name, compiled whole-word regex, tags) for each usable line in the patterns file
    aliases = []
    if patterns:
        with open(patterns.path, encoding="utf-8") as pattern_file:
            for raw_line in pattern_file:
                stripped = raw_line.strip()
                # Skip blank lines and comment lines
                if not stripped or raw_line.startswith("#"):
                    continue
                name, pattern, tags = stripped.split("\t", 2)
                aliases.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for alias_name, regex, _tags in aliases:
            if regex.match(w):
                return "[[%s]]" % alias_name
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    tagger_input = SENT_SEP.join(
        TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)

    args = [model.path]
    if morphtable:
        args += ["-m", morphtable.path]
    stdout, _ = util.system.call_binary(binary, args, tagger_input, encoding=encoding)

    # Map the tagger output back onto token indices, sentence by sentence
    out_annotation = word.create_empty_attribute()
    tagged_sentences = stdout.strip().split(SENT_SEP)
    for sent, tagged_sent in zip(sentences, tagged_sentences):
        tagged_tokens = tagged_sent.strip().split(TOK_SEP)
        for token_index, tagged_token in zip(sent, tagged_tokens):
            raw_tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_annotation[token_index] = tag_mapping.get(raw_tag, raw_tag)

    out.write(out_annotation)
Пример #8
0
def override(chunk: Annotation, repl: Annotation, out: Output):
    """Replace values in 'chunk' with non empty values from 'repl'.

    A replacement value counts as empty when it is falsy or equal to "|";
    in that case the original value from 'chunk' is kept.
    """
    replacements = list(repl.read())

    def _merged(originals):
        for position, original in enumerate(originals):
            candidate = replacements[position]
            if candidate and candidate != "|":
                yield candidate
            else:
                yield original

    out.write(_merged(chunk.read()))
Пример #9
0
def struct_to_token(
        attr: Annotation = Annotation("{struct}:{attr}"),
        token: Annotation = Annotation("<token>"),
        out: Output = Output("<token>:misc.from_struct_{struct}_{attr}")):
    """Convert an attribute on a structural annotation into a token attribute.

    Every token gets the attribute value of its parent in ``attr``; tokens
    whose parent is None get an empty string.
    """
    parent_of_token = token.get_parents(attr)
    values = list(attr.read())

    copied = []
    for parent_index in parent_of_token:
        if parent_index is None:
            copied.append("")
        else:
            copied.append(values[parent_index])
    out.write(copied)
Пример #10
0
def select(out: Output,
           annotation: Annotation,
           index: Optional[int] = 0,
           separator: Optional[str] = " "):
    """Select a specific index from the values of an annotation.

    Each value is split on 'separator' (a single space by default) and the
    element at 'index' is written; every value must therefore contain at
    least index + 1 elements. An 'index' given as a string is converted to int.
    """
    position = int(index) if isinstance(index, str) else index

    def _pick(value):
        return value.split(separator)[position]

    out.write(map(_pick, annotation.read()))
Пример #11
0
def upostag(out: Output = Output("<token>:misc.upos",
                                 cls="token:upos",
                                 description="Part-of-speeches in UD"),
            pos: Annotation = Annotation("<token:pos>")):
    """Convert SUC POS tags to UPOS.

    Every tag is translated with ``util.tagsets.pos_to_upos(tag, "swe", "SUC")``.
    """
    out.write([
        util.tagsets.pos_to_upos(suc_tag, "swe", "SUC")
        for suc_tag in pos.read()
    ])
Пример #12
0
    def save(self):
        """Save text data and annotation files to disk.

        Writes the NFC-normalized text, then for every element in ``self.data``
        its spans and one annotation per attribute, both sorted by span
        position. Finally the list of non-header elements and attributes is
        written with ``SourceStructure`` and, if there are any, the header
        elements with ``Headers``.
        """
        text = unicodedata.normalize("NFC", "".join(self.text))
        Text(self.doc).write(text)
        structure = []
        header_elements = []

        for element in self.data:
            is_header = element in self.header_elements
            spans = []
            attributes = {attr: [] for attr in self.data[element]["attrs"]}
            for instance in self.data[element]["elements"]:
                start, start_subpos, end, end_subpos, _original_element, attrs = instance
                spans.append(((start, start_subpos), (end, end_subpos)))
                for attr in attributes:
                    # Instances lacking an attribute get an empty value
                    attributes[attr].append(attrs.get(attr, ""))

            full_element = "{}.{}".format(self.prefix,
                                          element) if self.prefix else element

            if is_header:
                header_elements.append(full_element)
            else:
                structure.append(full_element)

            # Sort spans and annotations by span position (required by Sparv).
            # With no spans there is nothing to sort, and unpacking the result
            # of zip() over an empty sequence would raise ValueError.
            if attributes and spans:
                attr_names, attr_values = list(zip(*attributes.items()))
                spans, *attr_values = list(
                    zip(*sorted(zip(spans, *attr_values), key=lambda x: x[0])))
                attributes = dict(zip(attr_names, attr_values))
            else:
                spans.sort()

            Output(full_element, doc=self.doc).write(spans)

            for attr in attributes:
                full_attr = "{}.{}".format(self.prefix,
                                           attr) if self.prefix else attr
                Output("{}:{}".format(full_element, full_attr),
                       doc=self.doc).write(attributes[attr],
                                           allow_newlines=is_header)
                if not is_header:
                    structure.append("{}:{}".format(full_element, full_attr))

        # Save list of all elements and attributes to a file (needed for export)
        SourceStructure(self.doc).write(structure)

        if header_elements:
            # Save list of all header elements to a file
            Headers(self.doc).write(header_elements)
Пример #13
0
def concat(out: Output,
           left: Annotation,
           right: Annotation,
           separator: str = "",
           merge_twins: bool = False):
    """Concatenate values from two annotations, with an optional separator.

    If merge_twins is set to True, identical values are not concatenated;
    the value from 'left' is written once instead.
    """
    right_values = list(right.read())

    def _joined(left_values):
        for position, left_value in enumerate(left_values):
            right_value = right_values[position]
            if merge_twins and left_value == right_value:
                yield left_value
            else:
                yield f"{left_value}{separator}{right_value}"

    out.write(_joined(left.read()))
Пример #14
0
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.

    Only named entities of type "LOC" whose subtype contains "PPL" are looked
    up in the model. The 'method' argument is not used by this function.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if not (ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]):
                    continue
                location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                location_data = model.get(location_text.lower())
                # Names without an entry in the model are skipped silently
                if location_data:
                    chunk_locations[ch].append((location_text, list(location_data)))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
Пример #15
0
def number_relative(out: Output = Output("{annotation}:misc.number_rel_{parent}"),
                    parent: Annotation = Annotation("{parent}"),
                    child: Annotation = Annotation("{annotation}"),
                    prefix: str = "",
                    zfill: bool = False,
                    start: int = START_DEFAULT):
    """Number chunks by their relative position within a parent.

    Numbering restarts at 'start' for every parent. With 'zfill' the numbers
    are zero-padded to the width of the highest number within that parent.
    """
    parent_children, _orphans = parent.get_children(child)

    def _numbers():
        for children in parent_children:
            # Padding width depends on the number of children of this parent
            width = len(str(len(children) - 1 + start)) if zfill else 0
            for cnr, _index in enumerate(children, start):
                yield "{prefix}{nr:0{length}d}".format(prefix=prefix, length=width, nr=cnr)

    out.write(_numbers())
Пример #16
0
def replace_list(chunk: Annotation,
                 out: Output,
                 find: str = "",
                 sub: str = ""):
    """Find and replace annotations.

    Find string must match whole annotation.
    find and sub are whitespace separated lists of words to replace and their replacement.

    Raises util.SparvErrorMessage if the two lists differ in length.
    """
    targets = find.split()
    replacements = sub.split()
    if len(targets) != len(replacements):
        raise util.SparvErrorMessage(
            "Find and sub must have the same number of words.")

    translate = dict(zip(targets, replacements))
    out.write(map(lambda value: translate.get(value, value), chunk.read()))
Пример #17
0
def ufeatstag(out: Output = Output(
    "<token>:misc.ufeats",
    cls="token:ufeats",
    description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert SUC MSD tags to universal features.

    Each (pos, msd) pair is converted with ``util.tagsets.suc_to_feats`` and
    the result is serialized with ``util.cwbset``.
    """
    out.write([
        util.cwbset(util.tagsets.suc_to_feats(pos_tag, msd_tag))
        for pos_tag, msd_tag in zip(pos.read(), msd.read())
    ])
Пример #18
0
def postag(out: Output = Output("<token>:hunpos.pos",
                                cls="token:pos",
                                description="Part-of-speech tags"),
           msd: Annotation = Annotation("<token>:hunpos.msd")):
    """Extract POS from MSD.

    Delegates to ``misc.select``, keeping the first "."-separated field of
    every MSD value.
    """
    # Imported inside the function, as in the original implementation
    from sparv.modules.misc import misc

    select_field = misc.select
    select_field(out, msd, index=0, separator=".")
Пример #19
0
def swefn_words(out: Output = Output("<token>:lexical_classes.swefn",
                                     description="Lexical classes for tokens from SweFN"),
                model: Model = Model("[lexical_classes.swefn_word_model]"),
                saldoids: Annotation = Annotation("<token:sense>"),
                pos: Annotation = Annotation("<token:pos>"),
                pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                disambiguate: bool = True,
                connect_ids: bool = False,
                delimiter: str = util.DELIM,
                affix: str = util.AFFIX,
                scoresep: str = util.SCORESEP,
                lexicon=None):
    """Swefn specific wrapper for annotate_words. See annotate_words for more info."""

    def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        """Return the sorted lexicon entries for the given SALDO IDs.

        With connect_IDs every entry is suffixed with scoresep and the SALDO ID
        it was found for.
        """
        collected = set()
        for sid in saldo_ids or ():
            entries = lexicon.lookup(sid, default=set())
            if connect_IDs:
                collected.update(entry + scoresep + sid for entry in entries)
            else:
                collected.update(entries)
        return sorted(collected)

    annotate_words(out, model, saldoids, pos, annotate_swefn,
                   pos_limit=pos_limit,
                   disambiguate=disambiguate,
                   connect_ids=connect_ids,
                   delimiter=delimiter,
                   affix=affix,
                   scoresep=scoresep,
                   lexicon=lexicon)
Пример #20
0
def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring",
                                          description="Lexical classes for tokens from Blingbring"),
                     model: Model = Model("[lexical_classes.bb_word_model]"),
                     saldoids: Annotation = Annotation("<token:sense>"),
                     pos: Annotation = Annotation("<token:pos>"),
                     pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                     class_set: str = "bring",
                     disambiguate: bool = True,
                     connect_ids: bool = False,
                     delimiter: str = util.DELIM,
                     affix: str = util.AFFIX,
                     scoresep: str = util.SCORESEP,
                     lexicon=None):
    """Blingbring specific wrapper for annotate_words. See annotate_words for more info.

    'class_set' must be one of "bring", "roget_head", "roget_subsection",
    "roget_section" or "roget_class"; any other value is logged as a warning
    and replaced by "bring".
    """
    if class_set not in ["bring", "roget_head", "roget_subsection", "roget_section", "roget_class"]:
        # Pass the offending value as a logging argument so it actually shows up in the message
        log.warning("Class '%s' not available. Fallback to 'bring'.", class_set)
        class_set = "bring"

    def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        """Return the sorted lexicon entries for the given SALDO IDs."""
        rogetid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    # NOTE(review): this branch iterates the raw lookup result instead of the
                    # 'class_set' entry used in the branch below - confirm that this is intended.
                    rogetid = rogetid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    rogetid = rogetid.union(lexicon.lookup(sid, default=dict()).get(class_set, set()))
        return sorted(rogetid)

    annotate_words(out, model, saldoids, pos, annotate_bring, pos_limit=pos_limit, disambiguate=disambiguate,
                   class_set=class_set, connect_ids=connect_ids, delimiter=delimiter, affix=affix, scoresep=scoresep,
                   lexicon=lexicon)
Пример #21
0
def annotate(
        lang: Language = Language(),
        model: Model = Model("[treetagger.model]"),
        tt_binary: Binary = Binary("[treetagger.binary]"),
        out_upos: Output = Output("<token>:treetagger.upos",
                                  cls="token:upos",
                                  description="Part-of-speeches in UD"),
        out_pos: Output = Output(
            "<token>:treetagger.pos",
            cls="token:pos",
            description="Part-of-speeches from TreeTagger"),
        out_baseform: Output = Output("<token>:treetagger.baseform",
                                      description="Baseforms from TreeTagger"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger.

    The words of every sentence are sent to the TreeTagger binary on stdin.
    From its output the tag column is written to 'out_pos', its UPOS
    translation to 'out_upos' and the lemma column to 'out_baseform'.
    """
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    tagger_input = SENT_SEP.join(
        TOK_SEP.join(word_annotation[token_index] for token_index in sent)
        for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary, args, tagger_input, encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    def _tagged_fields():
        """Yield (token_id, output columns) for every token paired with a tagger output line."""
        for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
            for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
                yield token_id, tagged_token.strip().split(TAG_SEP)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for token_id, fields in _tagged_fields():
        tag = fields[TAG_COLUMN]
        out_pos_annotation[token_id] = tag
        out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for token_id, fields in _tagged_fields():
        out_lemma_annotation[token_id] = fields[LEM_COLUMN]
    out_baseform.write(out_lemma_annotation)
Пример #22
0
def process_output(word: Annotation, out: Output, stdout, in_sentences,
                   saldo_annotation, prob_format, default_prob):
    """Parse WSD output and write annotation.

    For every token, the senses found in 'saldo_annotation' are paired with
    the probability reported for them in 'stdout' (or 'default_prob' when none
    is reported), sorted by descending probability and then by sense, and
    written as a set. When 'prob_format' is truthy each sense is suffixed
    with ``prob_format % probability``.
    """
    out_annotation = word.create_empty_attribute()

    # Split output into sentences
    sentence_boundary = "\t".join(["_", "_", "_", "_", SENT_SEP, "_", "_"])
    out_sentences = [chunk for chunk in stdout.strip().split(sentence_boundary) if chunk]

    # Split output into tokens
    for out_sent, in_sent in zip(out_sentences, in_sentences):
        out_tokens = [line for line in out_sent.split("\n") if line]
        for out_tok, in_tok in zip(out_tokens, in_sent):
            columns = out_tok.split("\t")
            probs = [p for p in columns[6].split("|") if p != "_"]
            meanings = [m for m in columns[5].split("|") if m != "_"]
            senses = [
                s for s in saldo_annotation[in_tok].strip(util.AFFIX).split(util.DELIM)
                if s
            ]

            scored = []
            for sense in senses:
                if probs and sense in meanings:
                    scored.append((sense, float(probs[meanings.index(sense)])))
                else:
                    scored.append((sense, default_prob))

            # Sort by probability
            scored.sort(key=lambda pair: (-pair[1], pair[0]))

            # Format probability according to prob_format
            if prob_format:
                formatted = [sense + prob_format % prob for sense, prob in scored]
            else:
                formatted = [sense for sense, _prob in scored]
            out_annotation[in_tok] = util.cwbset(formatted)

    out.write(out_annotation)
Пример #23
0
def ids(doc: Document = Document(),
        annotation: Annotation = Annotation("{annotation}"),
        out: Output = Output("{annotation}:misc.id",
                             description="Unique ID for {annotation}"),
        docid: AnnotationData = AnnotationData("<docid>"),
        prefix: str = ""):
    """Create unique IDs for every span of an existing annotation.

    Every ID is produced by ``_make_id`` with 'prefix' followed by the
    document ID as its prefix, together with the IDs generated so far.
    """
    id_prefix = prefix + docid.read()
    spans = list(annotation.read())

    # Use doc name and annotation name as seed for the IDs
    _reset_id("{}/{}".format(doc, annotation), len(spans))

    generated = []
    for _ in range(len(spans)):
        generated.append(_make_id(id_prefix, generated))
    out.write(generated)
Пример #24
0
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output(
                      "<text>:readability.nk",
                      description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Create nominal ratio annotation for text.

    For every text, the POS tags of its tokens are passed to
    ``nominal_ratio_calc`` together with 'noun_pos' and 'verb_pos', and the
    result is formatted with 'fmt'.
    """
    tokens_per_text, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    def _ratio(token_indices):
        tags = [pos_annotation[token_index] for token_index in token_indices]
        return fmt % nominal_ratio_calc(tags, noun_pos, verb_pos)

    # Calculate the nominal ratio for every text element
    out.write([_ratio(token_indices) for token_indices in tokens_per_text])
Пример #25
0
def number_by_attribute(out: Output = Output("{annotation}:misc.number_by_{attribute}"),
                        chunk: Annotation = Annotation("{annotation}:{attribute}"),
                        prefix: str = "",
                        zfill: bool = False,
                        start: int = START_DEFAULT):
    """Number chunks, with the order determined by an attribute.

    The sort key of a chunk is ``_natural_sorting`` applied to its attribute value.
    """
    _read_chunks_and_write_new_ordering(
        out,
        chunk,
        lambda _index, value: _natural_sorting(value),
        prefix,
        zfill,
        start,
    )
Пример #26
0
def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix",
                              description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Create OVIX annotation for text.

    For every text, its (word, pos) pairs are filtered through
    ``actual_words`` with 'skip_pos', passed to ``ovix_calc`` and the result
    is formatted with 'fmt'.
    """
    tokens_per_text, _orphans = text.get_children(word)
    token_attrs = list(word.read_attributes((word, pos)))

    def _ovix(token_indices):
        members = [token_attrs[token_index] for token_index in token_indices]
        return fmt % ovix_calc(list(actual_words(members, skip_pos)))

    # Calculate OVIX for every text element
    out.write([_ovix(token_indices) for token_indices in tokens_per_text])
Пример #27
0
def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Get location data based on metadata containing location names.

    The location name of a chunk is read from 'source', stripped and
    lower-cased, and looked up in the model. The 'method' argument is not
    used by this function.
    """
    geomodel = load_model(model, language=language)

    same_target_source = chunk.split()[0] == source.split()[0]
    chunk_annotation = list(chunk.read())
    source_annotation = list(source.read())

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    target_source_parents = None if same_target_source else list(source.get_parents(chunk))

    def _location_source(position):
        """Return the location name to use for the chunk at 'position', or None."""
        if same_target_source:
            return source_annotation[position]
        parent_index = target_source_parents[position]
        return source_annotation[parent_index] if parent_index is not None else None

    chunk_locations = {}
    for i in range(len(chunk_annotation)):
        location_source = _location_source(i)
        if not location_source:
            chunk_locations[i] = []
            continue
        location_data = geomodel.get(location_source.strip().lower())
        # Names without an entry in the model leave the chunk without an entry here
        if location_data:
            chunk_locations[i] = [(location_source, list(location_data))]

    chunk_locations = most_populous(chunk_locations)

    out_annotation = chunk.create_empty_attribute()
    for c in chunk_locations:
        out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
Пример #28
0
def _read_chunks_and_write_new_ordering(out: Output, chunk: Annotation, order, prefix="", zfill=False,
                                        start=START_DEFAULT):
    """Common function called by other numbering functions.

    ``order(index, value)`` gives the sort key of each chunk. Chunks sharing a
    key get the same number; the numbers follow the sorted keys, beginning at
    'start'. With 'zfill' they are zero-padded to the width of the highest number.
    """
    groups = defaultdict(list)
    for position, value in enumerate(list(chunk.read())):
        groups[order(position, value)].append(position)

    out_annotation = chunk.create_empty_attribute()

    nr_digits = len(str(len(groups) - 1 + start))
    width = nr_digits if zfill else 0
    for nr, key in enumerate(sorted(groups), start):
        label = "{prefix}{nr:0{length}d}".format(prefix=prefix, length=width, nr=nr)
        for index in groups[key]:
            out_annotation[index] = label

    out.write(out_annotation)
Пример #29
0
def number_by_position(out: Output = Output("{annotation}:misc.number_position"),
                       chunk: Annotation = Annotation("{annotation}"),
                       prefix: str = "",
                       zfill: bool = False,
                       start: int = START_DEFAULT):
    """Number chunks by their position.

    The sort key of a chunk is its span, as returned by ``chunk.read_spans()``.
    """
    spans = list(chunk.read_spans())

    _read_chunks_and_write_new_ordering(
        out,
        chunk,
        lambda index, _value: spans[index],
        prefix,
        zfill,
        start,
    )
Пример #30
0
def dateformat(
        in_from: Annotation = Annotation("[dateformat.datetime_from]"),
        in_to: Optional[Annotation] = Annotation("[dateformat.datetime_to]"),
        out_from: Output = Output(
            "[dateformat.out_annotation]:dateformat.datefrom",
            description="From-dates"),
        out_to: Optional[Output] = Output(
            "[dateformat.out_annotation]:dateformat.dateto",
            description="To-dates"),
        informat: str = Config("dateformat.datetime_informat"),
        outformat: str = Config("dateformat.date_outformat"),
        splitter: Optional[str] = Config("dateformat.splitter", None),
        regex: Optional[str] = Config("dateformat.regex", None)):
    """Convert existing dates/times to specified date output format.

    All arguments are passed on unchanged to ``_formatter``.

    http://docs.python.org/library/datetime.html#strftime-and-strptime-behavior

    Args:
        in_from: Annotation containing from-dates (and times).
            Defaults to Annotation("[dateformat.datetime_from]").
        in_to: Annotation containing to-dates.
            Defaults to Annotation("[dateformat.datetime_to]").
        out_from: Annotation with from-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.datefrom", description="From-dates").
        out_to: Annotation with to-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.dateto", description="To-dates").
        informat: Format of the in_from and in_to dates/times.
            Several formats can be specified separated by |. They will be tried in order.
            Defaults to Config("dateformat.datetime_informat").
        outformat: Desired format of the out_from and out_to dates.
            Several formats can be specified separated by |. They will be tied to their respective in-format.
            Defaults to Config("dateformat.date_outformat").
        splitter: One or more characters separating two dates in 'in_from',
            treating them as from-date and to-date. Defaults to Config("dateformat.splitter", None).
        regex: Regular expression with a catching group whose content will be used in the parsing
            instead of the whole string. Defaults to Config("dateformat.regex", None).
    """
    formatter_args = (in_from, in_to, out_from, out_to, informat, outformat, splitter, regex)
    _formatter(*formatter_args)