Example #1
def add(out, fileids, files=None, filelist=None, prefix=""):
    """ Adds IDs for new files to an existing list of file IDs, and removes missing ones. """

    assert files or filelist, "files or filelist must be specified"

    if filelist:
        with open(filelist, "r") as f:
            files = f.read().strip()

    files = files.split()
    files.sort()

    OUT = util.read_annotation(fileids)
    numfiles = (len(files) + len(OUT)) * 2

    # Add new files
    for f in files:
        if f not in OUT:
            util.resetIdent(f, numfiles)
            OUT[f] = prefix + util.mkIdent("", list(OUT.values()))
            util.log.info("File %s added.", f)

    # Remove deleted files
    todelete = []
    for f in OUT:
        if f not in files:
            todelete.append(f)
            util.log.info("File %s removed.", f)

    for f in todelete:
        del OUT[f]

    util.write_annotation(out, OUT)
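
A minimal, self-contained sketch of the reconciliation step above: new files get a fresh ID and files that no longer exist are dropped. The real code delegates ID creation to util.resetIdent/util.mkIdent; the make_id callback here is a hypothetical stand-in.

def sync_file_ids(existing, files, make_id, prefix=""):
    """Return a copy of 'existing' with IDs added for new files and deleted files removed."""
    out = dict(existing)
    for f in files:                                   # add new files
        if f not in out:
            out[f] = prefix + make_id(out.values())
    for f in [k for k in out if k not in files]:      # remove deleted files
        del out[f]
    return out

ids = sync_file_ids({"a.xml": "01"}, ["a.xml", "b.xml"],
                    make_id=lambda used: str(len(list(used)) + 1).zfill(2))
print(ids)  # {'a.xml': '01', 'b.xml': '02'}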
Example #2
def annotate(out_prefix,
             out_suffix,
             word,
             msd,
             model,
             delimiter="|",
             affix="|",
             lexicon=None):
    """Divides compound words into prefix and suffix.
    - out_prefix is the resulting annotation file for prefixes
    - out_suffix is the resulting annotation file for suffixes
    - word and msd are existing annotations for wordforms and MSDs
    - model is the Saldo compound model
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = SaldoLexicon(model)

    WORD = util.read_annotation(word)
    MSD = util.read_annotation(msd)

    OUT_p = {}
    OUT_s = {}

    for tokid in WORD:
        compounds = compound(lexicon, WORD[tokid], MSD[tokid])
        OUT_p[tokid] = affix + delimiter.join(set(
            c[0][1] for c in compounds)) + affix if compounds else affix
        OUT_s[tokid] = affix + delimiter.join(set(
            c[1][1] for c in compounds)) + affix if compounds else affix

    util.write_annotation(out_prefix, OUT_p)
    util.write_annotation(out_suffix, OUT_s)
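
The OUT_p/OUT_s values above use the usual "|"-wrapped set format. A small illustration with hypothetical prefixes (a real run gets them from the compound analysis):

delimiter, affix = "|", "|"
prefixes = {"glas", "glass"}
value = affix + delimiter.join(prefixes) + affix if prefixes else affix
print(value)   # e.g. "|glas|glass|" (set order is not guaranteed)
print(affix)   # a bare "|" is written when no analysis is found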
Example #3
def process_output(out, stdout, in_sentences, SALDO, sensefmt, default_prob):
    """Parse WSD output and write annotation."""
    OUT = {}

    # Split output into sentences
    out_sentences = stdout.strip()
    out_sentences = out_sentences.split("\t".join(["_", "_", "_", "_", SENT_SEP, "_", "_"]))
    out_sentences = [i for i in out_sentences if i]

    # Split output into tokens
    for out_sent, in_sent in zip(out_sentences, in_sentences):
        out_tokens = [t for t in out_sent.split("\n") if t]
        for (out_tok, in_tok) in zip(out_tokens, in_sent):
            out_prob = out_tok.split("\t")[6]
            out_prob = [i for i in out_prob.split("|") if i != "_"]
            out_meanings = [i for i in out_tok.split("\t")[5].split("|") if i != "_"]
            saldo = [i for i in SALDO[in_tok].strip(util.AFFIX).split(util.DELIM) if i]

            new_saldo = []
            if out_prob:
                for meaning in saldo:
                    if meaning in out_meanings:
                        i = out_meanings.index(meaning)
                        new_saldo.append(meaning + sensefmt % float(out_prob[i]))
                    else:
                        new_saldo.append(meaning + sensefmt % float(default_prob))
            else:
                new_saldo = [meaning + sensefmt % float(default_prob) for meaning in saldo]

            # Sort by probability
            new_saldo = sorted(new_saldo, key=lambda x: float(x.split(":")[-1]), reverse=True)
            OUT[in_tok] = util.cwbset(new_saldo)

    util.write_annotation(out, OUT)
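
A hedged sketch of how a single WSD output token line is taken apart above; the column layout (meanings in column 5, probabilities in column 6, both "|"-separated) is assumed from the indexing in process_output, and the sample line is made up.

out_tok = "kort\t_\t_\t_\t_\tkort..1|kort..2\t0.7|0.3"
out_meanings = [i for i in out_tok.split("\t")[5].split("|") if i != "_"]
out_prob = [i for i in out_tok.split("\t")[6].split("|") if i != "_"]
print(out_meanings, out_prob)  # ['kort..1', 'kort..2'] ['0.7', '0.3']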
Example #4
def predict(model, order, struct, parent, word, out, pos, raw=False):
    """
    Predict a structural attribute.

    Both model and model.json must exist. See --train.
    """

    raw = raw == 'true'

    m_json = json.load(open(model + '.json'))

    data = (Example(None, text.words, text.span)
            for text in texts([(order, struct, parent, word, pos)],
                              map_label=lambda _: '?',
                              min_word_length=m_json['min_word_length'],
                              banned_pos=m_json['banned_pos']))

    index_to_label = m_json['index_to_label']

    args = ['--initial_regressor', model]

    if raw:
        predictions = ((span,
                        '|' + '|'.join(index_to_label[str(s)] + ':' + str(v)
                                       for s, v in ss) + '|')
                       for ss, span in vw_predict(args, data, raw=True))
    else:
        predictions = ((span, index_to_label[str(s)])
                       for s, span in vw_predict(args, data))

    util.write_annotation(out, predictions)
Example #5
def annotate_parents(text, out, parent, child, ignore_missing_parent=False):
    """Annotate parent links; parent, child are names for existing annotations."""
    ignore_missing_parent = util.strtobool(ignore_missing_parent)
    parent_chunks, child_spans = read_parents_and_children(text, parent, child)
    OUT = {}
    previous_parent_id = None
    try:
        parent_span, parent_id = next(parent_chunks)
        for child_span, child_id in child_spans:
            while child_span.stop > parent_span.stop:
                if parent_id:
                    previous_parent_id = parent_id
                parent_span, parent_id = next(parent_chunks)
            if not parent_id or parent_span.start > child_span.start:
                if not ignore_missing_parent:
                    util.log.warning("Child '%s' missing parent; closest parent is %s",
                                     child_id, parent_id or previous_parent_id)
                parent_id = ""
            OUT[child_id] = parent_id
    except StopIteration:
        pass
    if out:
        util.write_annotation(out, OUT)
    else:
        return OUT
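
A minimal sketch of the matching idea above, with spans modelled as range objects (matching the .start/.stop attributes used by annotate_parents): a child gets the ID of the first parent whose span ends at or after the child and starts no later than it. This is a simplified version of the logic, with made-up spans and IDs.

parents = [(range(0, 10), "p1"), (range(10, 25), "p2")]
children = [(range(2, 5), "c1"), (range(12, 20), "c2"), (range(30, 35), "c3")]

out = {}
parent_iter = iter(parents)
parent_span, parent_id = next(parent_iter)
for child_span, child_id in children:
    while child_span.stop > parent_span.stop:
        try:
            parent_span, parent_id = next(parent_iter)
        except StopIteration:
            parent_id = ""      # no parent covers this child
            break
    out[child_id] = parent_id if parent_id and parent_span.start <= child_span.start else ""
print(out)  # {'c1': 'p1', 'c2': 'p2', 'c3': ''}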
Example #6
def number_relative(out, parent_children, prefix="", start=START_DEFAULT):
    """ Number chunks by their relative position within a parent. """
    PARENT_CHILDREN = util.read_annotation(parent_children)

    util.write_annotation(out, ((child, "%s%0*d" % (prefix, len(str(len(PARENT_CHILDREN[parent].split()) - 1 + start)), cnr))
                                for parent in PARENT_CHILDREN
                                for cnr, child in enumerate(PARENT_CHILDREN[parent].split(), start)))
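
The numbering format above pads each relative number to the width of the largest number a parent's children can reach. Illustrated with made-up child IDs:

children = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10".split()
prefix, start = "n", 1
width = len(str(len(children) - 1 + start))
print(["%s%0*d" % (prefix, width, cnr) for cnr, _ in enumerate(children, start)])
# ['n01', 'n02', ..., 'n09', 'n10']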
Example #7
def sentiment(sense,
              out_scores,
              out_labels,
              model,
              max_decimals=6,
              lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.
    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation files.
    - model: pickled lexicon with saldoIDs as keys.
    - max_decimals: int stating the number of decimals the result is rounded to.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = util.read_annotation(sense)
    result_scores = {}
    result_labels = {}

    for token in sense:
        # Get set of senses for each token
        token_senses = dict([
            s.rsplit(util.SCORESEP, 1) if util.SCORESEP in s else (s, -1.0)
            for s in sense[token].split(util.DELIM) if s
        ])

        # Check for sense annotations and if any of the senses occur in the sentiment lexicon
        if token_senses and any(
                lexicon.lookup(s, (None, None))[1] for s in token_senses):
            sent_sum = 0.0
            labels_set = set()
            for s in token_senses:
                p = float(token_senses[s])
                if p < 0:
                    p = 1.0 / len(token_senses)
                sent_label, sent_score = lexicon.lookup(s, (None, None))
                if sent_label is not None:
                    labels_set.add(sent_label)
                    # Calculate weighted mean value
                    sent_sum += float(sent_score) * p
            result_scores[token] = str(round(sent_sum, max_decimals))
            # If there are multiple labels, derive label from polarity_score
            if len(labels_set) > 1:
                result_labels[token] = SENTIMENT_LABLES.get(round(sent_sum))
            else:
                result_labels[token] = SENTIMENT_LABLES.get(
                    int(list(labels_set)[0]))

        else:
            result_scores[token] = None
            result_labels[token] = None

    util.write_annotation(out_scores, result_scores)
    util.write_annotation(out_labels, result_labels)
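
A small worked example of the weighted mean above: senses that come without an explicit probability fall back to a uniform 1/n weight. The sense IDs and lexicon scores are hypothetical.

token_senses = {"ful..1": -1.0, "ful..2": -1.0}      # no probabilities given
lexicon_scores = {"ful..1": -0.5, "ful..2": -1.0}    # sentiment scores from the lexicon
sent_sum = sum(lexicon_scores[s] * (1.0 / len(token_senses)) for s in token_senses)
print(round(sent_sum, 6))  # -0.75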
Example #8
def merge(out, left, right, separator=""):
    """Concatenate values from two annotations, with an optional separator.
       Removes superfluous separators"""
    b = util.read_annotation(right)
    OUT = {}

    for key_a, val_a in util.read_annotation_iteritems(left):
        val = [x for x in [val_a, b[key_a]] if x != separator]
        OUT[key_a] = separator.join(list(val)) if val else separator

    util.write_annotation(out, OUT)
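
Illustration of the separator handling above: values equal to the separator are dropped before joining, so the result does not end up with dangling separators (values hypothetical).

separator = "|"
val_a, val_b = "a|b", "|"
val = [x for x in [val_a, val_b] if x != separator]
print(separator.join(val) if val else separator)  # "a|b", not "a|b||"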
Example #9
def align_texts(sentence1, sentence2, link1, link2, sent_parents1, sent_parents2, out_sentlink1, out_sentlink2):
    """Make a more fine-grained sentence alignment between the current text (1) and a parallel reference text (2).
    - sentence1 and sentence2 contain information about which word-IDs there are in each sentence
    - link1 and link2 are existing annotations for the link IDs in the two texts
    - sent_parents1 and sent_parents2 contain information about which sentences there are in each of the old sentence links
    - out_sentlink1 and out_sentlink2, are the resulting annotations for the new sentence links
    """

    REVERSED_LINK2 = {v: k for k, v in list(util.read_annotation(link2).items())}
    SENTPARENTS1 = util.read_annotation(sent_parents1)
    SENTPARENTS2 = util.read_annotation(sent_parents2)
    SENT1 = util.read_annotation(sentence1)
    SENT2 = util.read_annotation(sentence2)

    OUT_SENTLINK1 = {}
    OUT_SENTLINK2 = {}

    linkcounter = 0

    # Loop through existing links and split them into smaller units if possible (only if both links have text)
    for linkkey1, linkid in util.read_annotation_iteritems(link1):
        linkkey2 = REVERSED_LINK2[linkid]
        if linkkey1 in SENTPARENTS1 and linkkey2 in SENTPARENTS2:
            linkedsents1 = []
            linkedsents2 = []
            for sentid in SENTPARENTS1[linkkey1].split():
                linkedsents1.append((sentid, [w for w in SENT1[sentid].split()]))
            for sentid in SENTPARENTS2[linkkey2].split():
                linkedsents2.append((sentid, [w for w in SENT2[sentid].split()]))

            for s1, s2 in gachalign(linkedsents1, linkedsents2, mean="gacha"):
                linkcounter += 1
                if s1:
                    newlink1 = util.mkEdge('link', [util.edgeStart(s1[0]), util.edgeEnd(s1[-1])])
                    OUT_SENTLINK1[newlink1] = str(linkcounter)

                if s2:
                    newlink2 = util.mkEdge('link', [util.edgeStart(s2[0]), util.edgeEnd(s2[-1])])
                    OUT_SENTLINK2[newlink2] = str(linkcounter)

        # Add an annotation if a link has text in one language but is empty in the other
        elif linkkey1 in SENTPARENTS1 or linkkey2 in SENTPARENTS2:
            linkcounter += 1
            newlink1 = util.mkEdge('link', [util.edgeStart(linkkey1), util.edgeEnd(linkkey1)])
            OUT_SENTLINK1[newlink1] = str(linkcounter)
            newlink2 = util.mkEdge('link', [util.edgeStart(linkkey2), util.edgeEnd(linkkey2)])
            OUT_SENTLINK2[newlink2] = str(linkcounter)

    util.write_annotation(out_sentlink1, OUT_SENTLINK1)
    util.write_annotation(out_sentlink2, OUT_SENTLINK2)
Example #10
def read_chunks_and_write_new_ordering(out, chunks, order, prefix="", start=START_DEFAULT):
    if isinstance(chunks, str):
        chunks = chunks.split()

    new_order = defaultdict(list)
    for chunknr, chunk in enumerate(chunks):
        for edge, val in util.read_annotation_iteritems(chunk):
            val = order(chunknr, edge, val)
            new_order[val].append(edge)

    nr_digits = len(str(len(new_order) - 1 + start))
    util.write_annotation(out, ((edge, "%s%0*d" % (prefix, nr_digits, nr))
                                for nr, key in enumerate(sorted(new_order), start)
                                for edge in new_order[key]))
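
A sketch of the reordering above: edges are grouped under a sort key computed by the order() callback and then renumbered in sorted-key order. Both the callback and the annotation contents are hypothetical here.

from collections import defaultdict

def order(chunknr, edge, val):
    """Hypothetical sort-key callback: sort by value, then by chunk number."""
    return (val, chunknr)

chunks = [{"e1": "b", "e2": "a"}, {"e3": "a"}]   # chunknr -> {edge: value}

new_order = defaultdict(list)
for chunknr, chunk in enumerate(chunks):
    for edge, val in chunk.items():
        new_order[order(chunknr, edge, val)].append(edge)

numbered = [(edge, nr) for nr, key in enumerate(sorted(new_order), 1)
            for edge in new_order[key]]
print(numbered)  # [('e2', 1), ('e3', 2), ('e1', 3)]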
Example #11
def ovix_annot(order,
               text,
               parent_text,
               words,
               pos,
               out,
               skip_pos="MAD MID PAD",
               fmt="%.2f"):
    structs = [(text, parent_text)]
    columns = [words, pos]
    texts = cwb.vrt_iterate(*cwb.tokens_and_vrt(order, structs, columns))
    util.write_annotation(out,
                          ((span, fmt % ovix(actual_words(cols, skip_pos)))
                           for (_, span), cols in texts if span is not None))
Example #12
def sentiment(sense,
              out_scores,
              out_labels,
              model,
              max_decimals=6,
              lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.
    When more than one sense is possible, use the most probable one.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation files.
    - model: pickled lexicon with saldoIDs as keys.
    - max_decimals: int stating the number of decimals the result is rounded to.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = util.read_annotation(sense)
    result_scores = {}
    result_labels = {}

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [
            tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else
            (s, -1.0) for s in sense[token].split(util.DELIM) if s
        ]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores[token] = score
            result_labels[token] = SENTIMENT_LABLES.get(int(score))
        else:
            result_scores[token] = None
            result_labels[token] = None

    util.write_annotation(out_scores, result_scores)
    util.write_annotation(out_labels, result_labels)
Example #13
def nominal_ratio_annot(order,
                        text,
                        parent_text,
                        pos,
                        out,
                        noun_pos="NN PP PC",
                        verb_pos="PN AB VB",
                        fmt="%.2f"):
    structs = [(text, parent_text)]
    columns = [pos]
    texts = cwb.vrt_iterate(*cwb.tokens_and_vrt(order, structs, columns))
    util.write_annotation(
        out,
        ((span, fmt % nominal_ratio([col[0]
                                     for col in cols], noun_pos, verb_pos))
         for (_, span), cols in texts if span is not None))
Example #14
def contextual(out, chunk, context, ne, ne_subtype, text, model, method="populous", language=[], encoding="UTF-8"):
    """Annotate chunks with location data, based on locations contained within the text.
    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """

    if isinstance(language, str):
        language = language.split()

    model = load_model(model, language=language)

    text = util.read_corpus_text(text)
    chunk = util.read_annotation(chunk)
    context = util.read_annotation(context)
    ne = util.read_annotation(ne)
    ne_text = annotate.text_spans(text, ne, None)
    ne_subtype = util.read_annotation(ne_subtype)

    children_context_chunk = parent.annotate_children(text, None, context, chunk, ignore_missing_parent=True)
    children_chunk_ne = parent.annotate_children(text, None, chunk, ne, ignore_missing_parent=True)

    result = {}

    for cont, chunks in list(children_context_chunk.items()):
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne[n] == "LOC" and "PPL" in ne_subtype[n]:
                    location_text = ne_text[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # util.log.info("No location found for %s" % ne_text[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            result[c] = _format_location(chunk_locations.get(c, ()))

    util.write_annotation(out, result)
Example #15
def align_texts(word1,
                word2,
                linktok1,
                linktok2,
                link1,
                link2,
                linkref2,
                out_wordlink,
                out_sentences,
                outindex1,
                outindex2,
                delimiter="|",
                affix="|"):
    """Make a word alignment between the current text (1) and a reference text (2). The texts need to be sentence aligned.
    word1 and word2 are existing annotations for the wordforms in the two texts
    linktok1 and linktok2 contain information about which words there are in each link
    link1 and link2 are existing annotations for the sentence link IDs in the two texts
    linkref2 is the existing annotation for the word linkref IDs in text 2
    out_wordlink is the resulting annotation for the word links (refers to linkrefs in text 2)
    out_sentences, outindex1 and outindex2 are internal files needed for fast_align and atools
    """

    LINKREF2 = util.read_annotation(linkref2)
    WORD1 = util.read_annotation(word1)
    WORD2 = util.read_annotation(word2)

    text1, text2 = make_sent_aligned_text(WORD1, WORD2, linktok1, linktok2,
                                          link1, link2, out_sentences)
    indices = word_align(out_sentences, outindex1, outindex2)

    # collect existing word links in a temporary dictionary
    TMP_WORDLINK = {}
    for indices, sent1, sent2 in zip(indices.split(b"\n"), text1, text2):
        for index_pair in indices.split():
            i, j = index_pair.split(b"-")
            tokid1 = sent1[int(i)]
            linklist = TMP_WORDLINK.get(tokid1, [])
            linklist.append(LINKREF2[sent2[int(j)]])
            TMP_WORDLINK[tokid1] = linklist

    # make final annotation including empty word links
    OUT_WORDLINK = {}
    for tokid in WORD1:
        OUT_WORDLINK[tokid] = affix + delimiter.join(TMP_WORDLINK.get(tokid, -1)) + affix \
            if TMP_WORDLINK.get(tokid, -1) != -1 else affix

    util.write_annotation(out_wordlink, OUT_WORDLINK)
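
A sketch of the index-pair parsing above: fast_align/atools emit byte strings of pairs such as b"0-2", meaning token 0 of the first sentence is aligned to token 2 of the second. The token IDs here are hypothetical.

indices = b"0-0 1-2 2-1"
sent1 = ["w1:1", "w1:2", "w1:3"]    # token IDs in text 1
sent2 = ["w2:1", "w2:2", "w2:3"]    # token IDs in text 2

links = {}
for index_pair in indices.split():
    i, j = index_pair.split(b"-")
    links.setdefault(sent1[int(i)], []).append(sent2[int(j)])
print(links)  # {'w1:1': ['w2:1'], 'w1:2': ['w2:3'], 'w1:3': ['w2:2']}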
Example #16
def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []
    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)
    ws = (
        util.cwbset(weights[vw_normalize(word)])
        for word in words
        if vw_normalize(word) in weights
    )
    util.write_annotation(doc, out, ws)
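
A sketch of the --invert_hash line parsing above, using the sample line from the comment ("allmänna[1]:14342849:0.0139527"): feature name, hash and weight, with an optional [index] suffix selecting the label.

line = "allmänna[1]:14342849:0.0139527"
word, _hash, weight = line.split(":")
if word[-1] == "]":
    word, index = word.rsplit("[", 1)
    n = int(index[:-1]) + 1      # the label index is offset by one
else:
    n = 1
print(word, n, weight)           # allmänna 2 0.0139527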
Example #17
def annotate_standard(out,
                      input_annotation,
                      annotator,
                      extra_input='',
                      delimiter="|",
                      affix="|",
                      split=True):
    """
      Applies the 'annotator' function to the annotations in 'input_annotation' and writes the new output
      to 'out'. The annotator function should have type :: token_id -> oldannotations -> newannotations.
      No support for multiword expressions.
    - out is the output file
    - input_annotation is the given input annotation
    - annotator is the function which is to be applied to the input annotation
    - extra_input is an extra input annotation
    - delimiter is the delimiter character to put between ambiguous results
    - affix is an optional character to put before and after results
    - split defines if the input annotation is a set, with elements separated by delimiter;
      if so, a list is returned. Otherwise, a single element is returned
    """
    def merge(d1, d2):
        result = dict(d1)
        for k, v in list(d2.items()):
            if k in result:
                result[k] = result[k] + delimiter + v
            else:
                result[k] = v
        return result

    LEMS = util.read_annotation(input_annotation)
    if extra_input:
        LEMS = merge(LEMS, util.read_annotation(extra_input))

    util.clear_annotation(out)
    OUT = {}

    for tokid in LEMS:
        thelems = LEMS[tokid]
        if split:
            thelems = [x for x in thelems.split(delimiter) if x != '']

        output_annotation = set(annotator(tokid, thelems))
        OUT[tokid] = affix + delimiter.join(
            list(output_annotation)) + affix if output_annotation else affix

    util.write_annotation(out, OUT)
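
Illustration of the nested merge() helper above: keys present in both annotations get their values joined with the delimiter, other keys are kept as-is (the annotation contents are made up).

delimiter = "|"
d1 = {"t1": "spring..1", "t2": "bank..1"}
d2 = {"t1": "spring..2"}

result = dict(d1)
for k, v in d2.items():
    result[k] = result[k] + delimiter + v if k in result else v
print(result)  # {'t1': 'spring..1|spring..2', 't2': 'bank..1'}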
Example #18
def lix_annot(order,
              text,
              parent_text,
              sentence,
              parent_sentence,
              words,
              pos,
              out,
              skip_pos="MAD MID PAD",
              fmt="%.2f"):
    structs = [(text, parent_text), (sentence, parent_sentence)]
    columns = [words, pos]
    texts = cwb.vrt_iterate(*cwb.tokens_and_vrt(order, structs, columns),
                            trail=[0, 1])

    util.write_annotation(
        out, ((span, fmt % lix((actual_words(cols, skip_pos)
                                for _, cols in sentences)))
              for (_, span), sentences in texts if span is not None))
Example #19
def mergemany(out, annotations, separator="|"):
    """Concatenate values from two or more annotations, with an optional separator.
       Removes superfluous separators"""
    # annotations = [util.read_annotation(a) for a in annotations]
    d = {}
    OUT = {}

    if isinstance(annotations, str):
        annotations = annotations.split()
    for annotation in [util.read_annotation(a) for a in annotations]:
        for key_a, val_a in list(annotation.items()):
            if val_a:
                d.setdefault(key_a, []).append(val_a)

    for key, lst in list(d.items()):
        OUT[key] = separator + separator.join(
            lst) + separator if lst else separator

    util.write_annotation(out, OUT)
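
A small illustration of mergemany's behaviour: empty values are skipped and the remaining ones are wrapped in and joined with the separator (the annotation contents are made up).

separator = "|"
annotations = [{"t1": "a"}, {"t1": "b", "t2": ""}, {"t1": "c"}]

d = {}
for annotation in annotations:
    for key_a, val_a in annotation.items():
        if val_a:
            d.setdefault(key_a, []).append(val_a)
print({key: separator + separator.join(lst) + separator for key, lst in d.items()})
# {'t1': '|a|b|c|'}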
Example #20
def metadata(out, chunk, source, model, text=None, method="populous", language=[], encoding="UTF-8"):
    """Get location data based on metadata containing location names.
    """

    if isinstance(language, str):
        language = language.split()

    model = load_model(model, language=language)

    same_target_source = chunk == source
    chunk = util.read_annotation(chunk)
    source = util.read_annotation(source)

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source and text:
        text = util.read_corpus_text(text)
        target_source_parents = parent.annotate_parents(text, None, source, chunk, ignore_missing_parent=True)

    result = {}
    chunk_locations = {}

    for c in chunk:
        if same_target_source:
            location_source = source.get(c)
        else:
            location_source = source.get(target_source_parents.get(c))

        if location_source:
            location_data = model.get(location_source.strip().lower())
            if location_data:
                chunk_locations[c] = [(location_source, list(location_data))]
        else:
            chunk_locations[c] = []

    chunk_locations = most_populous(chunk_locations)

    for c in chunk:
        result[c] = _format_location(chunk_locations.get(c, ()))

    util.write_annotation(out, result)
Example #21
def fileid(out, files=None, filelist=None, prefix=""):
    """Creates unique IDs for every file in a list, using the filenames as seed.
    The resulting IDs are written to the file specified by 'out'."""

    assert files or filelist, "files or filelist must be specified"

    if filelist:
        with open(filelist, "r") as f:
            files = f.read().strip()

    files = files.split()
    files.sort()

    numfiles = len(files) * 2
    OUT = {}

    for f in files:
        util.resetIdent(f, numfiles)
        OUT[f] = prefix + util.mkIdent("", list(OUT.values()))

    util.write_annotation(out, OUT)
Example #22
def msdtag(model, out, word, sentence, tag_mapping=None, morphtable=None, patterns=None, encoding=util.UTF8):
    """POS/MSD tag using the Hunpos tagger.
    """
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.__dict__[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns, mode="r", encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """ Replace word with alias if word matches a regex pattern. """
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences = [sent.split() for _, sent in util.read_annotation_iteritems(sentence)]
    WORD = util.read_annotation(word)
    stdin = SENT_SEP.join(TOK_SEP.join(replace_word(WORD[tokid]) for tokid in sent)
                          for sent in sentences)
    args = [model]
    if morphtable:
        args.extend(["-m", morphtable])
    stdout, _ = util.system.call_binary("hunpos-tag", args, stdin, encoding=encoding, verbose=True)

    OUT = {}
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            OUT[token_id] = tag
    util.write_annotation(out, OUT)
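
A sketch of the pattern replacement above: tokens matching a configured regular expression are replaced by an alias before tagging. The pattern file contents here are hypothetical.

import re

pattern_list = [("number", re.compile(r"^\d+$"), "RG")]

def replace_word(w):
    """Replace word with alias if it matches one of the configured patterns."""
    for name, pattern, _tags in pattern_list:
        if pattern.match(w):
            return "[[%s]]" % name
    return w

print([replace_word(w) for w in ["år", "1986", "."]])  # ['år', '[[number]]', '.']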
Example #23
def word_weights(model, word, pos, out):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + '.json'))
    index_to_label = m_json['index_to_label']
    min_word_length = int(m_json['min_word_length'] or '0')
    banned_pos = (m_json['banned_pos'] or '').split()
    words = util.read_annotation(word)
    poss = util.read_annotation(pos) if pos else {}
    data = (Example(None, vw_normalize(word))
            for span, word in list(words.items())
            if len(word) >= min_word_length
            if not pos or poss[span] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ['--initial_regressor', model, '--invert_hash', tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, 'r').readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(':')
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == ']':
                    bracesplit = word.rsplit('[', 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ':' + weight)
    ws = ((span, '|' + '|'.join(weights[vw_normalize(word)]) + '|')
          for span, word in list(words.items())
          if vw_normalize(word) in weights)
    util.write_annotation(out, ws)
Example #24
def sentiment_class(out, sent, classes):
    """Translate numeric sentiment values into classes.
    - out: resulting annotation file.
    - sent: existing sentiment annotation.
    - classes: numeric spans and classes, in the format '0:0.33:negative|0.33:0.66:neutral|0.66:1:positive'."""

    classes = dict((tuple(float(n) for n in c.split(":")[:2]), c.split(":")[2])
                   for c in classes.split("|"))
    sent = util.read_annotation(sent)
    result = {}

    for token in sent:
        if not sent[token]:
            result[token] = None
            continue
        sent_value = float(sent[token])
        for c in classes:
            if c[0] <= sent_value <= c[1]:
                result[token] = classes[c]
                break

    util.write_annotation(out, result)
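
A worked example of the class-span parsing above, using the format given in the docstring ("start:stop:label" triples separated by "|"):

classes_str = "0:0.33:negative|0.33:0.66:neutral|0.66:1:positive"
classes = dict((tuple(float(n) for n in c.split(":")[:2]), c.split(":")[2])
               for c in classes_str.split("|"))

sent_value = 0.7
label = next(lbl for (low, high), lbl in classes.items() if low <= sent_value <= high)
print(label)  # positive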
Example #25
def predict(order,
            struct,
            doc: str = Document,
            model: str = Model("[vw_topic_modelling.model]"),
            modeljson: str = Model("[vw_topic_modelling.modeljson]"),
            parent: str = Annotation("{chunk}"),
            word: str = Annotation("<token:word>"),
            out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"),
            pos: str = Annotation("<token:pos>"),
            raw: bool = False):
    """Predict a structural attribute."""
    raw = raw == "true"

    m_json = json.load(open(modeljson))

    data = (
        Example(None, text.words, text.span)
        for text in texts([(order, struct, parent, word, pos)],
                          map_label=lambda _: "?",
                          min_word_length=m_json["min_word_length"],
                          banned_pos=m_json["banned_pos"])
    )

    index_to_label = m_json["index_to_label"]

    args = ["--initial_regressor", model]

    if raw:
        predictions = (
            util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss)
            for ss, _span in vw_predict(args, data, raw=True)
        )
    else:
        predictions = (
            index_to_label[str(s)]
            for s, _span in vw_predict(args, data)
        )

    util.write_annotation(doc, out, predictions)
Example #26
def annotate(out, lemgram, model, affix="|", delimiter="|"):
    """ Annotates each lemgram with its corresponding saldo_id,
        according to model (crosslink.pickle)
      - out is the resulting annotation file
      - lemgram is the existing annotations for lemgrams
      - model is the crosslink model
    """
    lexicon = PivotLexicon(model)
    WORD = util.read_annotation(lemgram)

    OUT = {}

    for tokid in WORD:
        saldo_ids = []
        for lemgram in WORD[tokid].split(delimiter):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        OUT[tokid] = affix + delimiter.join(
            set(saldo_ids)) + affix if saldo_ids else affix

    util.write_annotation(out, OUT)
Example #27
def tt_proc(model,
            tt_binary,
            out_pos,
            out_msd,
            out_lem,
            word,
            sentence,
            lang,
            encoding=util.UTF8):
    """POS/MSD tag and lemmatize using the TreeTagger.
    - model is the binary TreeTagger model file
    - tt_binary provides the path to the TreeTagger executable
    - out_pos, out_msd and out_lem are the resulting annotation files
    - word and sentence are existing annotation files
    - lang is the two-letter language code of the language to be analyzed
    """

    sentences = [
        sent.split() for _, sent in util.read_annotation_iteritems(sentence)
    ]
    WORD = util.read_annotation(word)
    stdin = SENT_SEP.join(
        TOK_SEP.join(WORD[tokid] for tokid in sent) for sent in sentences)
    args = [
        "-token", "-lemma", "-cap-heuristics", "-no-unknown", "-eos-tag",
        "<eos>", model
    ]

    stdout, _ = util.system.call_binary(tt_binary,
                                        args,
                                        stdin,
                                        encoding=encoding,
                                        verbose=True)

    # Write pos and msd annotations.
    OUT_POS = {}
    OUT_MSD = {}
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            OUT_MSD[token_id] = tag
            OUT_POS[token_id] = util.msd_to_pos.convert(tag, lang)
    util.write_annotation(out_msd, OUT_MSD)
    util.write_annotation(out_pos, OUT_POS)

    # Write lemma annotations.
    OUT_LEM = {}
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            OUT_LEM[token_id] = lem
    util.write_annotation(out_lem, OUT_LEM)
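
Illustration of how the tagger input is assembled above: tokens are joined with TOK_SEP inside a sentence and sentences with SENT_SEP. The separator values and the word annotation are hypothetical here.

SENT_SEP, TOK_SEP = "\n\n", "\n"
WORD = {"t1": "Det", "t2": "regnar", "t3": "."}
sentences = [["t1", "t2", "t3"]]

stdin = SENT_SEP.join(TOK_SEP.join(WORD[tokid] for tokid in sent) for sent in sentences)
print(repr(stdin))  # 'Det\nregnar\n.'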
Example #28
    def close(self):
        """This should be called at the end of the file. If in parser mode,
        it saves the corpus text and the annotations to files.
        """
        while self.tagstack:
            t, a, _ = self.tagstack[0]
            if t not in self.autoclose:
                util.log.error(self.pos() + "(at EOF) Autoclosing tag </%s>, starting at %s", t, a)
                self.errors = True
            else:
                util.log.info(self.pos() + "(at EOF) Autoclosing tag </%s>, starting at %s", t, a)
            self.handle_endtag(t)
        self.anchor()

        if self.skipped:
            new_elements = sorted(list(self.skipped.items()), key=lambda x: (-x[1], x[0]))
            new_elements_ann = " ".join(".".join([x[0][0].replace(":", "_"), x[0][1]]) if not x[0][1] is None else x[0][0].replace(":", "_") for x in new_elements)
            new_elements_ele = " ".join(":".join([x[0][0].replace(":", "\\:"), x[0][1]]) if not x[0][1] is None else x[0][0].replace(":", "\\:") for x in new_elements)
            if not self.elem_annotations:
                util.log.info("Found elements:")
                print()
                print("vrt_structs_annotations = " + new_elements_ann)
                print("vrt_structs             = " + new_elements_ele)
                print("xml_elements    = " + new_elements_ele)
                print("xml_annotations = " + new_elements_ann)
                print()
            else:
                print()
                print("xml_skip = " + new_elements_ele)
                print()

        # Only save results if no errors occurred
        if not self.errors:
            text = u"".join(self.textbuffer)
            util.write_corpus_text(self.textfile, text, self.pos2anchor)
            if self.elem_order:
                for elem in self.elem_order:
                    annot, db = elem[1], self.dbs[elem[1]]
                    util.write_annotation(annot, db)
            else:
                for annot, db in list(self.dbs.items()):
                    util.write_annotation(annot, db)
            for header, db in list(self.header_dbs.items()):
                util.write_annotation(header, db)

        HTMLParser.close(self)
Example #29
def do_segmentation(text,
                    element,
                    out,
                    chunk,
                    segmenter,
                    existing_segments=None,
                    model=None,
                    no_pickled_model=False):
    """Segment all "chunks" (e.g. sentences) into smaller "tokens" (e.g. words),
    and annotate them as "element" (e.g. w).
    Segmentation is done by the given "segmenter"; some segmenters take
    an extra argument which is a pickled "model" object.
    """
    if model:
        if not no_pickled_model:
            with open(model, "rb") as M:
                model = pickle.load(M, encoding='UTF-8')
        segmenter_args = (model, )
    else:
        segmenter_args = ()
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(
        sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(
        segmenter, "span_tokenize"
    ), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    corpus_text, anchor2pos, pos2anchor = util.read_corpus_text(text)

    # First we read the chunks and partition the text into spans
    # E.g., "one two <s>three four</s> five <s>six</s>"
    #   ==> ["one two ", "three four", " five ", "six"]
    #   (but using spans (pairs of anchors) instead of strings)

    positions = set()
    for c in chunk.split():
        CHUNK = util.read_annotation(c)
        positions = positions.union(
            set(anchor2pos[anchor] for edge in CHUNK
                for span in util.edgeSpans(edge) for anchor in span))
    positions = sorted(set([0, len(corpus_text)]) | positions)
    chunk_spans = list(zip(positions, positions[1:]))

    if existing_segments:
        OUT = util.read_annotation(existing_segments)
        token_spans = sorted((anchor2pos[start], anchor2pos[end])
                             for edge in OUT
                             for (start, end) in util.edgeSpans(edge))
        for n, (chunkstart, chunkend) in enumerate(chunk_spans[:]):
            for tokenstart, tokenend in token_spans:
                if tokenend <= chunkstart:
                    continue
                if tokenstart >= chunkend:
                    break
                if chunkstart != tokenstart:
                    chunk_spans.append((chunkstart, tokenstart))
                chunkstart = tokenend
                chunk_spans[n] = (chunkstart, chunkend)
        chunk_spans.sort()
        util.log.info("Reorganized into %d chunks" % len(chunk_spans))
    else:
        OUT = {}

    # Now we can segment each chunk span into tokens
    for start, end in chunk_spans:
        for spanstart, spanend in segmenter.span_tokenize(
                corpus_text[start:end]):
            spanstart += start
            spanend += start
            if corpus_text[spanstart:spanend].strip():
                span = pos2anchor[spanstart], pos2anchor[spanend]
                edge = util.mkEdge(element, span)
                OUT[edge] = None

    util.write_annotation(out, OUT)
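
A minimal sketch of the tokenization step above: span_tokenize yields offsets relative to each chunk, which are then shifted back into full-text positions. A trivial whitespace tokenizer stands in for the configured segmenter, and the chunk boundaries are made up.

import re

def span_tokenize(text):
    """Whitespace tokenizer returning (start, end) offsets, like the segmenters above."""
    return [(m.start(), m.end()) for m in re.finditer(r"\S+", text)]

corpus_text = "one two three four"
chunk_spans = [(0, 7), (8, 18)]       # hypothetical chunk boundaries

tokens = []
for start, end in chunk_spans:
    for spanstart, spanend in span_tokenize(corpus_text[start:end]):
        tokens.append((spanstart + start, spanend + start))
print([corpus_text[a:b] for a, b in tokens])  # ['one', 'two', 'three', 'four']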
Example #30
def annotate(out_complemgrams,
             out_compwf,
             out_baseform,
             word,
             msd,
             baseform_tmp,
             saldo_comp_model,
             nst_model,
             stats_model,
             complemgramfmt=util.SCORESEP + "%.3e",
             delimiter=util.DELIM,
             compdelim=util.COMPSEP,
             affix=util.AFFIX,
             cutoff=True,
             saldo_comp_lexicon=None,
             stats_lexicon=None):
    """Divides compound words into prefix(es) and suffix.
    - out_complemgrams is the resulting annotation file for compound lemgrams
      and their probabilities
    - out_compwf is the resulting annotation file for compound wordforms
    - out_baseform is the resulting annotation file for baseforms (including baseforms for compounds)
    - word and msd are existing annotations for wordforms and MSDs
    - baseform_tmp is the existing temporary annotation file for baseforms (not including compounds)
    - saldo_comp_model is the Saldo compound model
    - nst_model is the NST part of speech compound model
    - stats_model is the statistics model (pickled file)
    - complemgramfmt is a format string for how to print the complemgram and its probability
      (use an empty string to omit the probability)
    - saldo_comp_lexicon, stats_lexicon: these arguments cannot be set from the command line,
      but are used in the catapult. These arguments must be last.
    """

    ##################
    # Load models
    ##################
    if not saldo_comp_lexicon:
        saldo_comp_lexicon = SaldoCompLexicon(saldo_comp_model)

    with open(nst_model, "rb") as f:
        nst_model = pickle.load(f)

    if not stats_lexicon:
        stats_lexicon = StatsLexicon(stats_model)

    WORD = util.read_annotation(word)
    MSD = util.read_annotation(msd)

    # Create alternative lexicon (for words within the file)
    altlexicon = InFileLexicon(WORD, MSD)

    ##################
    # Do annotation
    ##################
    OUT_complem = {}
    OUT_compwf = {}
    OUT_baseform = {}
    IN_baseform = util.read_annotation(baseform_tmp)

    previous_compounds = {}

    for tokid in WORD:
        key = (WORD[tokid], MSD[tokid])
        if key in previous_compounds:
            compounds = previous_compounds[key]
        else:
            compounds = compound(saldo_comp_lexicon, altlexicon, WORD[tokid],
                                 MSD[tokid])

            if compounds:
                compounds = rank_compounds(compounds, nst_model, stats_lexicon)

                if cutoff:
                    # Only keep analyses with the same length (or +1) as the most probable one
                    best_length = len(compounds[0][1])
                    i = 0
                    for c in compounds:
                        if len(c[1]) > best_length + 1 or len(
                                c[1]) < best_length:
                            break

                        i += 1
                    compounds = compounds[:i]

            previous_compounds[key] = compounds

        # Create complem and compwf annotations
        make_complem_and_compwf(OUT_complem, OUT_compwf, complemgramfmt, tokid,
                                compounds, compdelim, delimiter, affix)

        # Create new baseform annotation if necessary
        if IN_baseform[tokid] != affix:
            OUT_baseform[tokid] = IN_baseform[tokid]
        else:
            make_new_baseforms(OUT_baseform, tokid, MSD[tokid], compounds,
                               stats_lexicon, altlexicon, delimiter, affix)

    util.write_annotation(out_complemgrams, OUT_complem)
    util.write_annotation(out_compwf, OUT_compwf)
    util.write_annotation(out_baseform, OUT_baseform)
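
A sketch of the cutoff above: the analyses are assumed to be ranked by probability, and only those whose number of parts equals that of the best analysis, or exceeds it by one, are kept. The compound analyses below are made up.

compounds = [(0.9, ["glas", "skål"]),
             (0.5, ["glass", "kål"]),
             (0.1, ["g", "las", "s", "kål"])]

best_length = len(compounds[0][1])
i = 0
for c in compounds:
    if len(c[1]) > best_length + 1 or len(c[1]) < best_length:
        break
    i += 1
print(compounds[:i])  # keeps the two 2-part analyses, drops the 4-part one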