Example #1
def override(chunk: Annotation, repl: Annotation, out: Output):
    """Replace values in 'chunk' with non empty values from 'repl'."""
    def empty(val):
        if not val:
            return True
        return val == "|"

    repl = list(repl.read())
    out.write((repl[n] if not empty(repl[n]) else val
               for (n, val) in enumerate(chunk.read())))
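A minimal sketch (not part of the Sparv API) of the same replacement rule applied to plain Python lists, assuming that the empty string and "|" both count as empty:

def _override_demo(chunk_values, repl_values):
    # Keep the original value unless the replacement is non-empty and not "|".
    return [r if r and r != "|" else c for c, r in zip(chunk_values, repl_values)]

# _override_demo(["NN", "VB", "JJ"], ["", "PN", "|"]) == ["NN", "PN", "JJ"]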
Example #2
def concat(out: Output,
           left: Annotation,
           right: Annotation,
           separator: str = "",
           merge_twins: bool = False):
    """Concatenate values from two annotations, with an optional separator.

    If merge_twins is set to True, no concatenation will be done on identical values.
    """
    b = list(right.read())
    out.write((f"{val_a}{separator}{b[n]}"
               if not (merge_twins and val_a == b[n]) else val_a
               for (n, val_a) in enumerate(left.read())))
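A minimal sketch (not part of the Sparv API) of the concatenation rule on plain lists, including the merge_twins case for identical values:

def _concat_demo(left, right, separator="", merge_twins=False):
    # Identical pairs are kept as a single value when merge_twins is True.
    return [a if merge_twins and a == b else f"{a}{separator}{b}"
            for a, b in zip(left, right)]

# _concat_demo(["a", "b"], ["a", "c"], "+", merge_twins=True) == ["a", "b+c"]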
Example #3
def vrt_scrambled(
        doc: Document = Document(),
        out: Export = Export("vrt_scrambled/{doc}.vrt"),
        chunk: Annotation = Annotation("[cwb.scramble_on]"),
        chunk_order: Annotation = Annotation(
            "[cwb.scramble_on]:misc.number_random"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output."
            .format(chunk))
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list, export_names, doc=doc, split_overlaps=True)

    # Read words and document ID
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name,
                                             chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Example #4
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
Example #5
def ufeatstag(out: Output = Output(
    "<token>:misc.ufeats",
    cls="token:ufeats",
    description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert SUC MSD tags to universal features."""
    pos_tags = pos.read()
    msd_tags = msd.read()
    out_annotation = []

    for pos_tag, msd_tag in zip(pos_tags, msd_tags):
        feats = util.tagsets.suc_to_feats(pos_tag, msd_tag)
        out_annotation.append(util.cwbset(feats))

    out.write(out_annotation)
Example #6
def uppercase(
    word: Annotation = Annotation("<token:word>"),
    out: Output = Output("<token>:uppercase.upper"),
    # some_config_variable: str = Config("uppercase.some_setting")
):
    """Convert to uppercase."""
    out.write([val.upper() for val in word.read()])
Example #7
def diapivot_annotate(
        out: Output = Output(
            "<token>:hist.diapivot",
            description="SALDO IDs corresponding to lemgrams"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Args:
        out (str, optional): Resulting annotation file.
            Defaults to Output("<token>:hist.diapivot", description="SALDO IDs corresponding to lemgrams").
        lemgram (str, optional): Existing lemgram annotation. Defaults to Annotation("<token>:saldo.lemgram").
        model (str, optional): Crosslink model. Defaults to Model("hist/diapivot.pickle").
    """
    lexicon = PivotLexicon(model)
    lemgram_annotation = list(lemgram.read())

    out_annotation = []

    for lemgrams in lemgram_annotation:
        saldo_ids = []
        for lemgram in lemgrams.split(util.DELIM):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        out_annotation.append((util.AFFIX + util.DELIM.join(set(saldo_ids)) + util.AFFIX)
                              if saldo_ids else util.AFFIX)

    out.write(out_annotation)
Example #8
def translate_tag(out: Output, tag: Annotation, mapping: dict = {}):
    """Convert part-of-speech tags, specified by the mapping.

    Example mappings: parole_to_suc, suc_to_simple, ...
    """
    if isinstance(mapping, str):
        mapping = util.tagsets.mappings[mapping]
    out.write((mapping.get(t, t) for t in tag.read()))
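A minimal sketch of how the lookup behaves, with a made-up mapping (not one of the real tagset mappings); unknown tags pass through unchanged:

_demo_mapping = {"NN": "NOUN", "VB": "VERB"}  # hypothetical mapping
# [_demo_mapping.get(t, t) for t in ["NN", "VB", "XX"]] == ["NOUN", "VERB", "XX"]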
Example #9
def find_replace_regex(chunk: Annotation,
                       out: Output,
                       find: str = "",
                       sub: str = ""):
    """Do find and replace in values of annotation using a regular expressions.

    N.B: When writing regular expressions in YAML they should be enclosed in single quotes.
    """
    out.write((re.sub(find, sub, val) for val in chunk.read()))
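A minimal sketch of the underlying re.sub call with a made-up pattern; values without a match are left unchanged:

import re

# [re.sub(r"\d+", "#", v) for v in ["page 12", "no digits"]] == ["page #", "no digits"]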
Example #10
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the token strings.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
                                       docid_annotation, include_empty_attributes, sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
Example #11
def msdtag(out: Output = Output(
    "<token>:hunpos.msd",
    cls="token:msd",
    description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append(
                        (name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(
            replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)
    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])
    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(
                sent,
                tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
Example #12
def struct_to_token(
        attr: Annotation = Annotation("{struct}:{attr}"),
        token: Annotation = Annotation("<token>"),
        out: Output = Output("<token>:misc.from_struct_{struct}_{attr}")):
    """Convert an attribute on a structural annotation into a token attribute."""
    token_parents = token.get_parents(attr)
    attr_values = list(attr.read())
    out_values = [
        attr_values[p] if p is not None else "" for p in token_parents
    ]
    out.write(out_values)
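A minimal sketch (plain lists, hypothetical data) of how a structural attribute is projected onto tokens via parent indices, where None marks a token without a parent:

_attr_values = ["news", "sports"]   # one value per structural span
_token_parents = [0, 0, None, 1]    # parent index (or None) for each token
_projected = [_attr_values[p] if p is not None else "" for p in _token_parents]
# _projected == ["news", "news", "", "sports"]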
Example #13
def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Get location data based on metadata containing location names."""
    geomodel = load_model(model, language=language)

    same_target_source = chunk.split()[0] == source.split()[0]
    chunk_annotation = list(chunk.read())
    source_annotation = list(source.read())

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source:
        target_source_parents = list(source.get_parents(chunk))

    chunk_locations = {}

    for i, _ in enumerate(chunk_annotation):
        if same_target_source:
            location_source = source_annotation[i]
        else:
            location_source = (source_annotation[target_source_parents[i]]
                               if target_source_parents[i] is not None else None)

        if location_source:
            location_data = geomodel.get(location_source.strip().lower())
            if location_data:
                chunk_locations[i] = [(location_source, list(location_data))]
        else:
            chunk_locations[i] = []

    chunk_locations = most_populous(chunk_locations)

    out_annotation = chunk.create_empty_attribute()
    for c in chunk_locations:
        out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
Example #14
def upostag(out: Output = Output("<token>:misc.upos",
                                 cls="token:upos",
                                 description="Part-of-speeches in UD"),
            pos: Annotation = Annotation("<token:pos>")):
    """Convert SUC POS tags to UPOS."""
    pos_tags = pos.read()
    out_annotation = []

    for tag in pos_tags:
        out_annotation.append(util.tagsets.pos_to_upos(tag, "swe", "SUC"))

    out.write(out_annotation)
Example #15
def select(out: Output,
           annotation: Annotation,
           index: Optional[int] = 0,
           separator: Optional[str] = " "):
    """Select a specific index from the values of an annotation.

    The given annotation values are split on 'separator' (a single space
    by default) and must contain at least index + 1 elements.
    """
    if isinstance(index, str):
        index = int(index)
    out.write(value.split(separator)[index] for value in annotation.read())
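A minimal sketch of the field selection on made-up values, with index=1 and the default separator:

_values = ["NN UTR SIN", "VB PRS AKT"]
# [v.split(" ")[1] for v in _values] == ["UTR", "PRS"]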
Example #16
def annotate(
        lang: Language = Language(),
        model: Model = Model("[treetagger.model]"),
        tt_binary: Binary = Binary("[treetagger.binary]"),
        out_upos: Output = Output("<token>:treetagger.upos",
                                  cls="token:upos",
                                  description="Part-of-speeches in UD"),
        out_pos: Output = Output(
            "<token>:treetagger.pos",
            cls="token:pos",
            description="Part-of-speeches from TreeTagger"),
        out_baseform: Output = Output("<token>:treetagger.baseform",
                                      description="Baseforms from TreeTagger"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(word_annotation[token_index] for token_index in sent)
        for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary,
                                             args,
                                             stdin,
                                             encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(
                tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
Example #17
def annotate(
        sense: Annotation = Annotation("<token>:saldo.sense"),
        out_scores: Output = Output("<token>:sensaldo.sentiment_score",
                                    description="SenSALDO sentiment score"),
        out_labels: Output = Output("<token>:sensaldo.sentiment_label",
                                    description="SenSALDO sentiment label"),
        model: Model = Model("[sensaldo.model]"),
        lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.

    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = sense.read()
    result_scores = []
    result_labels = []

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [
            tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else
            (s, -1.0) for s in token.split(util.DELIM) if s
        ]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores.append(score)
            result_labels.append(SENTIMENT_LABLES.get(int(score)))
        else:
            result_scores.append(None)
            result_labels.append(None)

    out_scores.write(result_scores)
    out_labels.write(result_labels)
Example #18
def replace_list(chunk: Annotation,
                 out: Output,
                 find: str = "",
                 sub: str = ""):
    """Find and replace annotations.

    The find strings must match whole annotation values.
    find and sub are whitespace-separated lists of words to replace and their replacements.
    """
    find = find.split()
    sub = sub.split()
    if len(find) != len(sub):
        raise util.SparvErrorMessage(
            "Find and sub must have the same number of words.")
    translate = dict((f, s) for (f, s) in zip(find, sub))
    out.write((translate.get(val, val) for val in chunk.read()))
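A minimal sketch of how the whitespace-separated find/sub strings become a lookup table (made-up word pairs); values without a match are kept as-is:

_translate = dict(zip("colour centre".split(), "color center".split()))
# [_translate.get(v, v) for v in ["colour", "theatre"]] == ["color", "theatre"]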
Example #19
def number_by_parent(out: Output = Output("{annotation}:misc.number_by_parent_{parent_annotation}__{parent_attribute}"),
                     chunk: Annotation = Annotation("{annotation}"),
                     parent_order: Annotation = Annotation("{parent_annotation}:{parent_attribute}"),
                     prefix: str = "",
                     zfill: bool = False,
                     start: int = START_DEFAULT):
    """Number chunks by (parent_order, chunk order)."""
    parent_children, _orphans = parent_order.get_children(chunk)

    child_order = {child_index: (parent_nr, child_index)
                   for parent_index, parent_nr in enumerate(parent_order.read())
                   for child_index in parent_children[parent_index]}

    def _order(index, _value):
        return child_order.get(index)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)
Example #20
def ids(doc: Document = Document(),
        annotation: Annotation = Annotation("{annotation}"),
        out: Output = Output("{annotation}:misc.id",
                             description="Unique ID for {annotation}"),
        docid: AnnotationData = AnnotationData("<docid>"),
        prefix: str = ""):
    """Create unique IDs for every span of an existing annotation."""
    docid = docid.read()
    prefix = prefix + docid

    ann = list(annotation.read())
    out_annotation = []
    # Use doc name and annotation name as seed for the IDs
    _reset_id("{}/{}".format(doc, annotation), len(ann))
    for _ in ann:
        new_id = _make_id(prefix, out_annotation)
        out_annotation.append(new_id)
    out.write(out_annotation)
Example #21
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output(
                      "<text>:readability.nk",
                      description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Create nominal ratio annotation for text."""
    text_children, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    # Calculate nominal ratio for every text element
    nk_annotation = []
    for text in text_children:
        in_pos = [pos_annotation[token_index] for token_index in text]
        nk_annotation.append(fmt %
                             nominal_ratio_calc(in_pos, noun_pos, verb_pos))
    out.write(nk_annotation)
Example #22
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list,
                                                              export_names,
                                                              doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Example #23
def _read_chunks_and_write_new_ordering(out: Output, chunk: Annotation, order, prefix="", zfill=False,
                                        start=START_DEFAULT):
    """Common function called by other numbering functions."""
    new_order = defaultdict(list)

    in_annotation = list(chunk.read())

    for i, val in enumerate(in_annotation):
        val = order(i, val)
        new_order[val].append(i)

    out_annotation = chunk.create_empty_attribute()

    nr_digits = len(str(len(new_order) - 1 + start))
    for nr, key in enumerate(sorted(new_order), start):
        for index in new_order[key]:
            out_annotation[index] = "{prefix}{nr:0{length}d}".format(prefix=prefix,
                                                                     length=nr_digits if zfill else 0,
                                                                     nr=nr)

    out.write(out_annotation)
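A minimal self-contained sketch (plain lists, not the Sparv API) of the grouping and zero-padded numbering above, with zfill enabled and start=1:

from collections import defaultdict

def _numbering_demo(keys, prefix="", zfill=True, start=1):
    # Group indices by sort key, then number the keys in sorted order.
    order = defaultdict(list)
    for i, key in enumerate(keys):
        order[key].append(i)
    digits = len(str(len(order) - 1 + start))
    result = [None] * len(keys)
    for nr, key in enumerate(sorted(order), start):
        for index in order[key]:
            result[index] = "{}{:0{}d}".format(prefix, nr, digits if zfill else 0)
    return result

# _numbering_demo(["b", "a", "b"]) == ["2", "1", "2"]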
Example #24
def text_headtail(text: Text = Text(),
                  chunk: Annotation = Annotation("<token>"),
                  out_head: Output = Output("<token>:misc.head"),
                  out_tail: Output = Output("<token>:misc.tail")):
    """Extract "head" and "tail" whitespace characters for tokens."""
    def escape(t):
        """Escape whitespace characters."""
        return t.replace(" ", "\\s").replace("\n", "\\n").replace("\t", "\\t")

    out_head_annotation = chunk.create_empty_attribute()
    out_tail_annotation = chunk.create_empty_attribute()
    head_text = None

    corpus_text = text.read()
    chunk = list(chunk.read())

    for i, span in enumerate(chunk):
        if head_text:
            out_head_annotation[i] = escape(head_text)
            head_text = None

        if i < len(chunk) - 1:
            tail_start = span[1][0]
            tail_end = chunk[i + 1][0][0]
            tail_text = corpus_text[tail_start:tail_end]

            try:
                n_pos = tail_text.rindex("\n")
            except ValueError:
                n_pos = None
            if n_pos is not None and n_pos + 1 < len(tail_text):
                head_text = tail_text[n_pos + 1:]
                tail_text = tail_text[:n_pos + 1]

            if tail_text:
                out_tail_annotation[i] = escape(tail_text)

    out_head.write(out_head_annotation)
    out_tail.write(out_tail_annotation)
Example #25
def annotate(
        maltjar: Binary = Binary("[malt.jar]"),
        model: Model = Model("[malt.model]"),
        out_dephead: Output = Output(
            "<token>:malt.dephead",
            cls="token:dephead",
            description="Positions of the dependency heads"),
        out_dephead_ref: Output = Output(
            "<token>:malt.dephead_ref",
            cls="token:dephead_ref",
            description="Sentence-relative positions of the dependency heads"),
        out_deprel: Output = Output(
            "<token>:malt.deprel",
            cls="token:deprel",
            description="Dependency relations to the head"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        sentence: Annotation = Annotation("<sentence>"),
        token: Annotation = Annotation("<token>"),
        encoding: str = util.UTF8,
        process_dict=None):
    """
    Run the malt parser, in an already started process defined in process_dict, or start a new process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If process seems dead, spawn a new
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar,
                                model,
                                encoding,
                                send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|",
                       msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(
        TOK_SEP.join(
            conll_token(n + 1, token_index)
            for n, token_index in enumerate(sent)) for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = len(
        stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP)
                          for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col)
                    for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = str(sent[head - 1]) if head else "-"
            out_dephead_ref_annotation[token_index] = (
                str(ref_annotation[sent[head - 1]]) if head else "")

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
Example #26
def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str],
                   class_set=None, disambiguate=True, connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX,
                   scoresep=util.SCORESEP, lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).

    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bring() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated.
        Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN: paste saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = saldoids.read()
    token_pos = list(pos.read())
    out_annotation = pos.create_empty_attribute()

    # Check if the saldo IDs are ranked (= word senses have been disambiguated)
    wsd = saldoids.split()[1].split(".")[0] == "wsd"

    for token_index, token_sense in enumerate(sense):

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, token_index, pos_limit):
            saldo_ids = None
            out_annotation[token_index] = affix
            continue

        if wsd and util.SCORESEP in token_sense:
            ranked_saldo = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    saldo_ids = [saldo_tuples[0]]
                    del saldo_tuples[0]

                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix
    out.write(out_annotation)
Example #27
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation(
                 "<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense",
                                        cls="token:sense",
                                        description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram",
                                          description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform",
                                           cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexica, typically the Saldo model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use empty string for no precision)
    - precision_filter: an optional filter, currently there are the following values:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if more than one)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi word annotations
    - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations.
      By setting this to True, all overlaps will be allowed.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path))
                        for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(
                name, None) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? since many (most?) multi-word in the old lexicons are inseparable (half öre etc)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with the out annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions still won't be
    # allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []  # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {
                "token_index": token_index,
                "annotations": annotation_info
            }

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag,
                                              precision, min_precision,
                                              precision_filter,
                                              annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis,
                                           thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation,
                                           sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(
                tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write(
            [v.get(annotation_name, delimiter) for v in out_annotation])
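A comment-only sketch of what the precision_filter settings mean for a hypothetical list of (annotation, score) candidates, as described in the docstring above:

# _candidates = [("id1", 0.9), ("id2", 0.9), ("id3", 0.4)]
# "max":   keep every candidate tied with the highest score -> ["id1", "id2"]
# "first": keep a single most probable candidate            -> ["id1"]
# "none":  keep all candidates                              -> ["id1", "id2", "id3"]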
Example #28
def annotate(
        wsdjar: Binary = Binary("[wsd.jar]"),
        sense_model: Model = Model("[wsd.sense_model]"),
        context_model: Model = Model("[wsd.context_model]"),
        out: Output = Output(
            "<token>:wsd.sense",
            cls="token:sense",
            description="Sense disambiguated SALDO identifiers"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        saldo: Annotation = Annotation("<token>:saldo.sense"),
        pos: Annotation = Annotation("<token:pos>"),
        token: Annotation = Annotation("<token>"),
        prob_format: str = Config("wsd.prob_format"),
        default_prob: float = Config("wsd.default_prob"),
        encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
      - wsdjar is the name of the Java program to be used for the WSD
      - sense_model and context_model are the models to be used with wsdjar
      - out is the resulting annotation file
      - sentence is an existing annotation for sentences and their children (words)
      - word is an existing annotation for wordforms
      - ref is an existing annotation for word references
      - lemgram and saldo are existing annotations for inflection tables and meanings
      - pos is an existing annotation for part-of-speech
      - prob_format is a format string for how to print the sense probability
      - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation,
                        lemgram_annotation, saldo_annotation, pos_annotation)

    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format,
                   default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
Example #29
def _formatter(in_from: Annotation, in_to: Optional[Annotation],
               out_from: Output, out_to: Output, informat: str, outformat: str,
               splitter: str, regex: str):
    """Take existing dates/times and input formats and convert to specified output format."""
    def get_smallest_unit(informat):
        smallest_unit = 0  # No date

        if "%y" not in informat and "%Y" not in informat:
            pass
        elif "%b" not in informat and "%B" not in informat and "%m" not in informat:
            smallest_unit = 1  # year
        elif "%d" not in informat:
            smallest_unit = 2  # month
        elif "%H" not in informat and "%I" not in informat:
            smallest_unit = 3  # day
        elif "%M" not in informat:
            smallest_unit = 4  # hour
        elif "%S" not in informat:
            smallest_unit = 5  # minute
        else:
            smallest_unit = 6  # second

        return smallest_unit

    def get_date_length(informat):
        parts = informat.split("%")
        length = len(parts[0])  # First value is either blank or not part of date

        lengths = {
            "Y": 4,
            "3Y": 3,
            "y": 2,
            "m": 2,
            "b": None,
            "B": None,
            "d": 2,
            "H": None,
            "I": None,
            "M": 2,
            "S": 2
        }

        for part in parts[1:]:
            add = lengths.get(part[0], None)
            if add:
                length += add + len(part[1:])
            else:
                return None

        return length

    if not in_to:
        in_to = in_from

    informat = informat.split("|")
    outformat = outformat.split("|")

    assert len(outformat) == 1 or (len(outformat) == len(informat)), "The number of out-formats must be equal to one " \
                                                                     "or the number of in-formats."

    ifrom = list(in_from.read())
    ofrom = in_from.create_empty_attribute()

    for index, val in enumerate(ifrom):
        val = val.strip()
        if not val:
            ofrom[index] = None
            continue

        tries = 0
        for inf in informat:
            if splitter and splitter in inf:
                values = re.findall("%[YybBmdHMS]", inf)
                if len(set(values)) < len(values):
                    vals = val.split(splitter)
                    inf = inf.split(splitter)
            else:
                vals = [val]
                inf = [inf]

            if regex:
                temp = []
                for v in vals:
                    matches = re.search(regex, v)
                    if matches:
                        temp.append([x for x in matches.groups() if x][0])
                if not temp:
                    # If the regex doesn't match, treat as no date
                    ofrom[index] = None
                    continue
                vals = temp

            tries += 1
            try:
                fromdates = []
                for i, v in enumerate(vals):
                    if "%3Y" in inf[i]:
                        datelen = get_date_length(inf[i])
                        if datelen and not datelen == len(v):
                            raise ValueError
                        inf[i] = inf[i].replace("%3Y", "%Y")
                        v = "0" + v
                    if "%0m" in inf[i] or "%0d" in inf[i]:
                        inf[i] = inf[i].replace("%0m",
                                                "%m").replace("%0d", "%d")
                        datelen = get_date_length(inf[i])
                        if datelen and not datelen == len(v):
                            raise ValueError
                    fromdates.append(datetime.datetime.strptime(v, inf[i]))
                if len(fromdates) == 1 or out_to:
                    ofrom[index] = fromdates[0].strftime(outformat[0] if len(
                        outformat) == 1 else outformat[tries - 1])
                else:
                    outstrings = [
                        fromdate.strftime(outformat[0] if len(outformat) ==
                                          1 else outformat[tries - 1])
                        for fromdate in fromdates
                    ]
                    ofrom[index] = outstrings[0] + splitter + outstrings[1]
                break
            except ValueError:
                if tries == len(informat):
                    log.error("Could not parse: %s", str(vals))
                    raise
                continue

    out_from.write(ofrom)
    del ofrom

    if out_to:
        ito = list(in_to.read())
        oto = in_to.create_empty_attribute()

        for index, val in enumerate(ito):
            if not val:
                oto[index] = None
                continue

            tries = 0
            for inf in informat:
                if splitter and splitter in inf:
                    values = re.findall("%[YybBmdHMS]", inf)
                    if len(set(values)) < len(values):
                        vals = val.split(splitter)
                        inf = inf.split(splitter)
                else:
                    vals = [val]
                    inf = [inf]

                if regex:
                    temp = []
                    for v in vals:
                        matches = re.search(regex, v)
                        if matches:
                            temp.append([x for x in matches.groups() if x][0])
                    if not temp:
                        # If the regex doesn't match, treat as no date
                        oto[index] = None
                        continue
                    vals = temp

                tries += 1
                try:
                    todates = []
                    for i, v in enumerate(vals):
                        if "%3Y" in inf[i]:
                            datelen = get_date_length(inf[i])
                            if datelen and not datelen == len(v):
                                raise ValueError
                            inf[i] = inf[i].replace("%3Y", "%Y")
                            v = "0" + v
                        if "%0m" in inf[i] or "%0d" in inf[i]:
                            inf[i] = inf[i].replace("%0m",
                                                    "%m").replace("%0d", "%d")
                            datelen = get_date_length(inf[i])
                            if datelen and not datelen == len(v):
                                raise ValueError
                        todates.append(datetime.datetime.strptime(v, inf[i]))
                    smallest_unit = get_smallest_unit(inf[0])
                    if smallest_unit == 1:
                        add = relativedelta(years=1)
                    elif smallest_unit == 2:
                        add = relativedelta(months=1)
                    elif smallest_unit == 3:
                        add = relativedelta(days=1)
                    elif smallest_unit == 4:
                        add = relativedelta(hours=1)
                    elif smallest_unit == 5:
                        add = relativedelta(minutes=1)
                    elif smallest_unit == 6:
                        add = relativedelta(seconds=1)

                    todates = [
                        todate + add - relativedelta(seconds=1)
                        for todate in todates
                    ]
                    oto[index] = todates[-1].strftime(outformat[0] if len(
                        outformat) == 1 else outformat[tries - 1])
                    break
                except ValueError:
                    if tries == len(informat):
                        log.error("Could not parse: %s", str(vals))
                        raise
                    continue

        out_to.write(oto)
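A minimal sketch of the strptime/strftime round trip the formatter relies on, for one in-format and one out-format:

import datetime

_parsed = datetime.datetime.strptime("2021-03-05", "%Y-%m-%d")
# _parsed.strftime("%Y%m%d") == "20210305"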
Example #30
def csv(doc: Document = Document(),
        out: Export = Export("csv/{doc}.csv"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        sentence: Annotation = Annotation("<sentence>"),
        annotations: ExportAnnotations = ExportAnnotations(
            "csv_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "csv_export.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        delimiter: str = Config("csv_export.delimiter")):
    """Export annotations to CSV format."""
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token_name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list,
                                                              export_names,
                                                              doc=doc)

    # Make csv header
    csv_data = [
        _make_header(token_name, token_attributes, export_names, delimiter)
    ]

    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(
                    _make_token_line(word_annotation[span.index], token_name,
                                     token_attributes, annotation_dict,
                                     span.index, delimiter))

            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names,
                                    span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")

        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)