def annotate(out_prefix,
             out_suffix,
             word,
             msd,
             model,
             delimiter="|",
             affix="|",
             lexicon=None):
    """Divides compound words into prefix and suffix.
    - out_prefix is the resulting annotation file for prefixes
    - out_suffix is the resulting annotation file for suffixes
    - word and msd are existing annotations for wordforms and MSDs
    - model is the Saldo compound model
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = SaldoLexicon(model)

    WORD = util.read_annotation(word)
    MSD = util.read_annotation(msd)

    OUT_p = {}
    OUT_s = {}

    for tokid in WORD:
        compounds = compound(lexicon, WORD[tokid], MSD[tokid])
        OUT_p[tokid] = affix + delimiter.join(set(
            c[0][1] for c in compounds)) + affix if compounds else affix
        OUT_s[tokid] = affix + delimiter.join(set(
            c[1][1] for c in compounds)) + affix if compounds else affix

    util.write_annotation(out_prefix, OUT_p)
    util.write_annotation(out_suffix, OUT_s)
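# Hedged illustration (not part of the pipeline): how the prefix/suffix analyses
# above are rendered as cwb-style "set" strings with the default affix and
# delimiter "|". The lemgram ids below are made up.
delimiter = affix = "|"
prefixes = ["fot..nn.1", "fotbolls..nn.1"]
print(affix + delimiter.join(prefixes) + affix if prefixes else affix)  # |fot..nn.1|fotbolls..nn.1|
print(affix)  # a bare "|" is written for tokens with no compound analysis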
Example #2
def annotate_fallback(out,
                      word,
                      msd,
                      lemgram,
                      models,
                      key='lemgram',
                      lexicons=None):
    """ Annotates the words that does not already have a lemgram, according to model
        - out is the resulting annotation file
        - word is the words to be annotated
        - lemgram is the existing annotations for lemgram
        - model is the crosslink model
    """

    # catalaunch stuff
    if lexicons is None:
        models = models.split()
        lexicons = [saldo.SaldoLexicon(lex) for lex in models]

    WORD = util.read_annotation(word)
    MSD = util.read_annotation(msd)

    def annotate_empties(tokid, lemgrams):
        fallbacks = []
        if not lemgrams:
            word = WORD[tokid]
            msdtag = MSD[tokid]
            fallbacks.extend(getsingleannotation(lexicons, word, key, msdtag))

        return fallbacks

    annotate_standard(out, lemgram, annotate_empties)
Example #3
def make_sent_aligned_text(WORD1, WORD2, linktok1, linktok2, link1, link2,
                           out_sentences):
    """ Make a sentence aligned text file (serves as input for fast_align)."""
    out_sent_linked = open(out_sentences, 'w', encoding='utf-8')
    LINKTOK1 = util.read_annotation(linktok1)
    LINKTOK2 = util.read_annotation(linktok2)
    REVERSED_LINK2 = {
        v: k
        for k, v in list(util.read_annotation(link2).items())
    }

    all_text1 = []
    all_text2 = []
    for linkkey1, linkid in util.read_annotation_iteritems(link1):
        # ignore links that don't exist in reference text
        if linkid in REVERSED_LINK2:
            linkkey2 = REVERSED_LINK2[linkid]
            # ignore empty links
            if linkkey1 in LINKTOK1 and linkkey2 in LINKTOK2:
                text1 = [(w, WORD1[w]) for w in LINKTOK1[linkkey1].split()]
                text2 = [(w, WORD2[w]) for w in LINKTOK2[linkkey2].split()]
                out_sent_linked.write(" ".join(w for span, w in text1) +
                                      " ||| " +
                                      " ".join(w for span, w in text2) + "\n")
                all_text1.append([span for span, w in text1])
                all_text2.append([span for span, w in text2])

    out_sent_linked.close()
    return (all_text1, all_text2)
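# Hedged sketch of the sentence-aligned file written above: one line per linked
# sentence pair, with source and target separated by " ||| " as fast_align expects.
# The words are invented; real lines are built from the WORD1/WORD2 annotations.
src = ["det", "regnar", "idag"]
trg = ["it", "is", "raining", "today"]
print(" ".join(src) + " ||| " + " ".join(trg))  # det regnar idag ||| it is raining today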
Example #4
def run_wsd(wsdjar, sense_model, context_model, out, sentence, word, ref, lemgram, saldo, pos, text,
            sensefmt=util.SCORESEP + "%.3f", default_prob="-1", encoding=util.UTF8):
    """
    Runs the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.
    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
      - wsdjar is the name of the Java program to be used for the WSD
      - sense_model and context_model are the models to be used with wsdjar
      - out is the resulting annotation file
      - sentence is an existing annotation for sentences and their children (words)
      - word is an existing annotation for wordforms
      - ref is an existing annotation for word references
      - lemgram and saldo are existing annotations for inflection tables and meanings
      - pos is an existing annotation for part-of-speech
      - text is an existing file with the input text and its anchors.
      - sensefmt is a format string for how to print the sense and its probability
      - default_prob is the default value for unanalyzed senses
    """

    WORD = util.read_annotation(word)
    REF = util.read_annotation(ref)
    LEMGRAM = util.read_annotation(lemgram)
    SALDO = util.read_annotation(saldo)
    POS = util.read_annotation(pos)
    textpos = util.read_corpus_text(text)[1]

    # Sort sentences according to their text position because WSD is context dependent.
    sentences = sorted(util.read_annotation_iteritems(sentence), key=lambda x: textpos[util.edgeStart(x[0])])
    sentences = [sent.split() for _, sent in sentences]

    # Start WSD process
    process = wsd_start(wsdjar, sense_model, context_model, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, WORD, REF, LEMGRAM, SALDO, POS)
    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        util.log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(out, stdout, sentences, SALDO, sensefmt, default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
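# Hedged illustration of the sensefmt formatting, assuming util.SCORESEP is ":"
# (an assumption; the real value comes from util). A disambiguated sense and its
# probability are joined into a single string.
SCORESEP = ":"                      # assumed value of util.SCORESEP
sensefmt = SCORESEP + "%.3f"
print("ge..1" + sensefmt % 0.872)   # ge..1:0.872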
Example #5
def suc2hunpos(out, msd, sentences, word):
    """Build hunpos training material from existing word and MSD annotations."""
    WORD = util.read_annotation(word)
    MSD = util.read_annotation(msd)
    sentences = [sent.split() for _, sent in util.read_annotation_iteritems(sentences)]

    OUT = []

    for sentence in sentences:
        for tokid in sentence:
            OUT.append((WORD[tokid], MSD[tokid]))
        OUT.append(("", ""))

    write_hunsource(out, OUT)
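# Hedged sketch of what write_hunsource presumably emits: one "word<TAB>tag" line
# per token and an empty line between sentences, which is the training format
# hunpos reads. The word/tag pairs below are made up.
OUT = [("Det", "PN.NEU.SIN.DEF.SUB+OBJ"), ("regnar", "VB.PRS.AKT"), ("", "")]
print("\n".join("\t".join(pair).rstrip() for pair in OUT))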
Example #6
def align_texts(sentence1, sentence2, link1, link2, sent_parents1, sent_parents2, out_sentlink1, out_sentlink2):
    """Make a more fine-grained sentence alignment between the current text (1) and a parallel reference text (2).
    - sentence1 and sentence2 contain information about which word-IDs there are in each sentence
    - link1 and link2 are existing annotations for the link IDs in the two texts
    - sent_parents1 and sent_parents2 contain information about which sentences there are in each of the old sentence links
    - out_sentlink1 and out_sentlink2 are the resulting annotations for the new sentence links
    """

    REVERSED_LINK2 = {v: k for k, v in list(util.read_annotation(link2).items())}
    SENTPARENTS1 = util.read_annotation(sent_parents1)
    SENTPARENTS2 = util.read_annotation(sent_parents2)
    SENT1 = util.read_annotation(sentence1)
    SENT2 = util.read_annotation(sentence2)

    OUT_SENTLINK1 = {}
    OUT_SENTLINK2 = {}

    linkcounter = 0

    # Loop through existing links and split them into smaller units if possible (only if both links have text)
    for linkkey1, linkid in util.read_annotation_iteritems(link1):
        linkkey2 = REVERSED_LINK2[linkid]
        if linkkey1 in SENTPARENTS1 and linkkey2 in SENTPARENTS2:
            linkedsents1 = []
            linkedsents2 = []
            for sentid in SENTPARENTS1[linkkey1].split():
                linkedsents1.append((sentid, [w for w in SENT1[sentid].split()]))
            for sentid in SENTPARENTS2[linkkey2].split():
                linkedsents2.append((sentid, [w for w in SENT2[sentid].split()]))

            for s1, s2 in gachalign(linkedsents1, linkedsents2, mean="gacha"):
                linkcounter += 1
                if s1:
                    newlink1 = util.mkEdge('link', [util.edgeStart(s1[0]), util.edgeEnd(s1[-1])])
                    OUT_SENTLINK1[newlink1] = str(linkcounter)

                if s2:
                    newlink2 = util.mkEdge('link', [util.edgeStart(s2[0]), util.edgeEnd(s2[-1])])
                    OUT_SENTLINK2[newlink2] = str(linkcounter)

        # annotation if a link has text in one language but is empty in the other one
        elif linkkey1 in SENTPARENTS1 or linkkey2 in SENTPARENTS2:
            linkcounter += 1
            newlink1 = util.mkEdge('link', [util.edgeStart(linkkey1), util.edgeEnd(linkkey1)])
            OUT_SENTLINK1[newlink1] = str(linkcounter)
            newlink2 = util.mkEdge('link', [util.edgeStart(linkkey2), util.edgeEnd(linkkey2)])
            OUT_SENTLINK2[newlink2] = str(linkcounter)

    util.write_annotation(out_sentlink1, OUT_SENTLINK1)
    util.write_annotation(out_sentlink2, OUT_SENTLINK2)
Example #7
def add(out, fileids, files=None, filelist=None, prefix=""):
    """ Adds IDs for new files to an existing list of file IDs, and removes missing ones. """

    assert files or filelist, "files or filelist must be specified"

    if filelist:
        with open(filelist, "r") as f:
            files = f.read().strip()

    files = files.split()
    files.sort()

    OUT = util.read_annotation(fileids)
    numfiles = (len(files) + len(OUT)) * 2

    # Add new files
    for f in files:
        if f not in OUT:
            util.resetIdent(f, numfiles)
            OUT[f] = prefix + util.mkIdent("", list(OUT.values()))
            util.log.info("File %s added.", f)

    # Remove deleted files
    todelete = []
    for f in OUT:
        if f not in files:
            todelete.append(f)
            util.log.info("File %s removed.", f)

    for f in todelete:
        del OUT[f]

    util.write_annotation(out, OUT)
Example #8
def number_relative(out, parent_children, prefix="", start=START_DEFAULT):
    """ Number chunks by their relative position within a parent. """
    PARENT_CHILDREN = util.read_annotation(parent_children)

    # Zero-pad each child number to the width of the largest number a sibling can get
    util.write_annotation(out, ((child, "%s%0*d" % (prefix, len(str(len(PARENT_CHILDREN[parent].split()) - 1 + start)), cnr))
                                for parent in PARENT_CHILDREN
                                for cnr, child in enumerate(PARENT_CHILDREN[parent].split(), start)))
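# Hedged illustration of the numbering scheme above: each child number is
# zero-padded to the width of the largest number its siblings can get, so the
# values sort correctly as strings. The counts below are made up.
prefix, start, n_children = "", 1, 12
width = len(str(n_children - 1 + start))
print(["%s%0*d" % (prefix, width, cnr) for cnr in range(start, start + 3)])  # ['01', '02', '03']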
Example #9
def align_texts(word1,
                word2,
                linktok1,
                linktok2,
                link1,
                link2,
                linkref2,
                out_wordlink,
                out_sentences,
                outindex1,
                outindex2,
                delimiter="|",
                affix="|"):
    """Make a word alignment between the current text (1) and a reference text (2). The texts need to be sentence aligned.
    word1 and word2 are existing annotations for the wordforms in the two texts
    linktok1 and linktok2 contain information about which words there are in each link
    link1 and link2 are existing annotations for the sentence link IDs in the two texts
    linkref2 is the existing annotation for the word linkref IDs in text 2
    out_wordlink is the resulting annotation for the word links (refers to linkrefs in text 2)
    out_sentences, outindex1 and outindex2 are internal files needed for fast_align and atools
    """

    LINKREF2 = util.read_annotation(linkref2)
    WORD1 = util.read_annotation(word1)
    WORD2 = util.read_annotation(word2)

    text1, text2 = make_sent_aligned_text(WORD1, WORD2, linktok1, linktok2,
                                          link1, link2, out_sentences)
    indices = word_align(out_sentences, outindex1, outindex2)

    # collect existing word links in a temporary dictionary
    TMP_WORDLINK = {}
    for index_line, sent1, sent2 in zip(indices.split(b"\n"), text1, text2):
        for index_pair in index_line.split():
            i, j = index_pair.split(b"-")
            tokid1 = sent1[int(i)]
            linklist = TMP_WORDLINK.get(tokid1, [])
            linklist.append(LINKREF2[sent2[int(j)]])
            TMP_WORDLINK[tokid1] = linklist

    # make final annotation including empty word links
    OUT_WORDLINK = {}
    for tokid in WORD1:
        linklist = TMP_WORDLINK.get(tokid)
        OUT_WORDLINK[tokid] = affix + delimiter.join(linklist) + affix if linklist else affix

    util.write_annotation(out_wordlink, OUT_WORDLINK)
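# Hedged, self-contained illustration of the "i-j" index pairs produced by
# fast_align/atools and how they are turned into word links; the token ids and
# indices below are made up.
index_line = b"0-0 1-2 2-1"
sent1 = ["t1:w1", "t1:w2", "t1:w3"]   # token ids in text 1
sent2 = ["t2:w1", "t2:w2", "t2:w3"]   # token ids in text 2
links = {}
for pair in index_line.split():
    i, j = pair.split(b"-")
    links.setdefault(sent1[int(i)], []).append(sent2[int(j)])
print(links)  # {'t1:w1': ['t2:w1'], 't1:w2': ['t2:w3'], 't1:w3': ['t2:w2']}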
Example #10
def vrt_table(annotations_structs, annotations_columns):
    """
    Return a table suitable for printing as a vrt file from annotations.

    The structs are a pair of annotation and its parent.
    """
    structs_count = len(annotations_structs)
    parents = {}
    for annot, parent_annotation in annotations_structs:
        if parent_annotation not in parents:
            parents[parent_annotation] = util.read_annotation(parent_annotation)

    vrt = defaultdict(ListWithGet)

    for n, (annot, parent_annotation) in enumerate(annotations_structs):
        # Enumerate structural attributes, to handle attributes without values
        enumerated_struct = {
            span: [index, value, span]
            for index, (span, value)
            in enumerate(list(util.read_annotation(annot).items()), 1)
            # Must enumerate from 1, due to the use of any() later
        }
        token_annotations = (
            (word_tok, enumerated_struct.get(tok_span))
            for word_tok, tok_span
            in list(parents[parent_annotation].items())
        )
        for tok, value in token_annotations:
            if not value:
                # This happens for tokens that are outside the structural
                # attribute, such as b in "<text>a</text> b"
                value = ["", "", None]

            value[1] = "|" if value[1] == "|/|" else value[1]
            value[1] = value[1].replace("\n", " ") if value[1] else ""
            vrt[tok].append(value)

    for n, annot in enumerate(annotations_columns):
        n += structs_count
        annotation = util.read_annotation(annot)
        for key in vrt.keys():
            value = annotation.get(key, UNDEF)
            if n > structs_count:  # Any column except the first (the word)
                value = "|" if value == "|/|" else value
            vrt[key].append(value.replace("\n", " "))

    return vrt
Example #11
def contextual(out, chunk, context, ne, ne_subtype, text, model, method="populous", language=[], encoding="UTF-8"):
    """Annotate chunks with location data, based on locations contained within the text.
    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """

    if isinstance(language, str):
        language = language.split()

    model = load_model(model, language=language)

    text = util.read_corpus_text(text)
    chunk = util.read_annotation(chunk)
    context = util.read_annotation(context)
    ne = util.read_annotation(ne)
    ne_text = annotate.text_spans(text, ne, None)
    ne_subtype = util.read_annotation(ne_subtype)

    children_context_chunk = parent.annotate_children(text, None, context, chunk, ignore_missing_parent=True)
    children_chunk_ne = parent.annotate_children(text, None, chunk, ne, ignore_missing_parent=True)

    result = {}

    for cont, chunks in list(children_context_chunk.items()):
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne[n] == "LOC" and "PPL" in ne_subtype[n]:
                    location_text = ne_text[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # util.log.info("No location found for %s" % ne_text[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            result[c] = _format_location(chunk_locations.get(c, ()))

    util.write_annotation(out, result)
Example #12
def sentiment(sense,
              out_scores,
              out_labels,
              model,
              max_decimals=6,
              lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.
    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - max_decimals: the number of decimals the result is rounded to.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = util.read_annotation(sense)
    result_scores = {}
    result_labels = {}

    for token in sense:
        # Get set of senses for each token
        token_senses = dict([
            s.rsplit(util.SCORESEP, 1) if util.SCORESEP in s else (s, -1.0)
            for s in sense[token].split(util.DELIM) if s
        ])

        # Check for sense annotations and if any of the senses occur in the sentiment lexicon
        if token_senses and any(
                lexicon.lookup(s, (None, None))[1] for s in token_senses):
            sent_sum = 0.0
            labels_set = set()
            for s in token_senses:
                p = float(token_senses[s])
                if p < 0:
                    p = 1.0 / len(token_senses)
                sent_label, sent_score = lexicon.lookup(s, (None, None))
                if sent_label is not None:
                    labels_set.add(sent_label)
                    # Calculate weighted mean value
                    sent_sum += float(sent_score) * p
            result_scores[token] = str(round(sent_sum, max_decimals))
            # If there are multiple labels, derive label from polarity_score
            if len(labels_set) > 1:
                result_labels[token] = SENTIMENT_LABLES.get(round(sent_sum))
            else:
                result_labels[token] = SENTIMENT_LABLES.get(
                    int(list(labels_set)[0]))

        else:
            result_scores[token] = None
            result_labels[token] = None

    util.write_annotation(out_scores, result_scores)
    util.write_annotation(out_labels, result_labels)
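# Hedged, standalone illustration of the weighted mean computed above, with a
# made-up dict standing in for the pickled sentiment lexicon ((label, score) pairs).
token_senses = {"glad..1": 0.7, "glad..2": 0.3}                      # sense -> probability
toy_lexicon = {"glad..1": ("3", "0.9"), "glad..2": ("2", "0.4")}     # sense -> (label, score)
sent_sum = sum(float(toy_lexicon[s][1]) * p for s, p in token_senses.items())
print(round(sent_sum, 6))  # 0.75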
Example #13
def tag_ne(out_ne_ex,
           out_ne_type,
           out_ne_subtype,
           out_ne_name,
           word,
           sentence,
           encoding=util.UTF8,
           process_dict=None):
    """
    Tag named entities using HFST-SweNER.
    SweNER is either run in an already started process defined in
    process_dict, or a new process is started (default).
    - out_ne_ex, out_ne_type, out_ne_subtype and out_ne_name are the resulting annotation files for the named entities
    - word and sentence are existing annotation files for wordforms and sentences
    - process_dict should never be set from the command line
    """

    if process_dict is None:
        process = swenerstart("", encoding, verbose=False)
    # else:
    #     process = process_dict['process']
    #     # If process seems dead, spawn a new one
    #     if process.stdin.closed or process.stdout.closed or process.poll():
    #         util.system.kill_process(process)
    #         process = swenerstart("", encoding, verbose=False)
    #         process_dict['process'] = process

    # Collect all text
    sentences = [
        sent.split() for _, sent in util.read_annotation_iteritems(sentence)
    ]
    word_file = util.read_annotation(word)
    stdin = SENT_SEP.join(
        TOK_SEP.join(word_file[tokid] for tokid in sent) for sent in sentences)
    # Escape <, > and &
    stdin = xml.sax.saxutils.escape(stdin)

    # keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    # util.log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    # if process_dict is not None:
    #     process_dict['restart'] = not keep_process

    # # Does not work as of now since swener does not have an interactive mode
    # if keep_process:
    #     # Chatting with swener: send a SENT_SEP and read correct number of lines
    #     stdin_fd, stdout_fd = process.stdin, process.stdout
    #     stdin_fd.write(stdin.encode(encoding) + SENT_SEP)
    #     stdin_fd.flush()
    #     stout = stdout_fd.readlines()

    # else:
    # Otherwise use communicate which buffers properly
    # util.log.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding))
    stdout, _ = process.communicate(stdin.encode(encoding))
    # util.log.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding))

    parse_swener_output(sentences, stdout.decode(encoding), out_ne_ex,
                        out_ne_type, out_ne_subtype, out_ne_name)
Example #14
def annotate_standard(out,
                      input_annotation,
                      annotator,
                      extra_input='',
                      delimiter="|",
                      affix="|",
                      split=True):
    """
      Applies the 'annotator' function to the annotations in 'input_annotation' and writes the new output
      to 'out'. The annotator function should have type :: token_id -> oldannotations -> newannotations.
      No support for multiword expressions.
    - out is the output file
    - input_annotation is the given input annotation
    - annotator is the function which is applied to the input annotation
    - extra_input is an extra input annotation
    - delimiter is the delimiter character to put between ambiguous results
    - affix is an optional character to put before and after results
    - split defines whether the input annotation is a set with elements separated by delimiter;
      if so, the annotator receives a list, otherwise a single element
    """
    def merge(d1, d2):
        result = dict(d1)
        for k, v in list(d2.items()):
            if k in result:
                result[k] = result[k] + delimiter + v
            else:
                result[k] = v
        return result

    LEMS = util.read_annotation(input_annotation)
    if extra_input:
        LEMS = merge(LEMS, util.read_annotation(extra_input))

    util.clear_annotation(out)
    OUT = {}

    for tokid in LEMS:
        thelems = LEMS[tokid]
        if split:
            thelems = [x for x in thelems.split(delimiter) if x != '']

        output_annotation = set(annotator(tokid, thelems))
        OUT[tokid] = affix + delimiter.join(
            list(output_annotation)) + affix if output_annotation else affix

    util.write_annotation(out, OUT)
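# Hedged illustration of the nested merge() helper above: values for keys present
# in both inputs are joined with the delimiter instead of being overwritten.
delimiter = "|"
d1, d2 = {"w1": "kalla..vb.1"}, {"w1": "kalla..av.1", "w2": "fin..av.1"}
merged = dict(d1)
for k, v in d2.items():
    merged[k] = merged[k] + delimiter + v if k in merged else v
print(merged)  # {'w1': 'kalla..vb.1|kalla..av.1', 'w2': 'fin..av.1'}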
Example #15
def tokens_and_vrt(order, annotations_structs, annotations_columns):
    """
    Returns the tokens in order and the vrt table.
    """
    vrt = vrt_table(annotations_structs, annotations_columns)
    sortkey = util.read_annotation(order).get
    tokens = sorted(vrt, key=sortkey)
    return tokens, vrt
Example #16
def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []
    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)
    ws = (
        util.cwbset(weights[vw_normalize(word)])
        for word in words
        if vw_normalize(word) in weights
    )
    util.write_annotation(doc, out, ws)
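# Hedged, standalone illustration of how a Vowpal Wabbit --invert_hash line (like
# the sample shown in the comment above) is parsed into a word and a 1-based label index.
line = "allmänna[1]:14342849:0.0139527"
word, _hash, weight = line.split(":")
if word.endswith("]"):
    word, index = word.rsplit("[", 1)
    n = int(index[:-1]) + 1
else:
    n = 1
print(word, n, weight)  # allmänna 2 0.0139527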
Example #17
def number_by_parent(out, chunks, parent_order, parent_children, prefix="", start=START_DEFAULT):
    """ Number chunks by (parent order, chunk order). """
    PARENT_CHILDREN = util.read_annotation(parent_children)
    CHILD_ORDER = dict((cid, (pnr, cnr))
                       for (pid, pnr) in util.read_annotation_iteritems(parent_order)
                       for (cnr, cid) in enumerate(PARENT_CHILDREN.get(pid, "").split()))

    def order(chunknr, edge, _value):
        return (chunknr, CHILD_ORDER.get(edge))

    read_chunks_and_write_new_ordering(out, chunks, order, prefix, start)
Example #18
def tt_proc(model,
            tt_binary,
            out_pos,
            out_msd,
            out_lem,
            word,
            sentence,
            lang,
            encoding=util.UTF8):
    """POS/MSD tag and lemmatize using the TreeTagger.
    - model is the binary TreeTagger model file
    - tt_binary provides the path to the TreeTagger executable
    - out_pos, out_msd and out_lem are the resulting annotation files
    - word and sentence are existing annotation files
    - lang is the two-letter language code of the language to be analyzed
    """

    sentences = [
        sent.split() for _, sent in util.read_annotation_iteritems(sentence)
    ]
    WORD = util.read_annotation(word)
    stdin = SENT_SEP.join(
        TOK_SEP.join(WORD[tokid] for tokid in sent) for sent in sentences)
    args = [
        "-token", "-lemma", "-cap-heuristics", "-no-unknown", "-eos-tag",
        "<eos>", model
    ]

    stdout, _ = util.system.call_binary(tt_binary,
                                        args,
                                        stdin,
                                        encoding=encoding,
                                        verbose=True)

    # Write pos and msd annotations.
    OUT_POS = {}
    OUT_MSD = {}
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            OUT_MSD[token_id] = tag
            OUT_POS[token_id] = util.msd_to_pos.convert(tag, lang)
    util.write_annotation(out_msd, OUT_MSD)
    util.write_annotation(out_pos, OUT_POS)

    # Write lemma annotations.
    OUT_LEM = {}
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            OUT_LEM[token_id] = lem
    util.write_annotation(out_lem, OUT_LEM)
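# Hedged sketch of how the TreeTagger output is split back into columns, assuming
# SENT_SEP, TOK_SEP and TAG_SEP are "\n\n", "\n" and "\t" and that the tag and
# lemma columns are 1 and 2 (assumptions; the real constants live in this module).
SENT_SEP_, TOK_SEP_, TAG_SEP_ = "\n\n", "\n", "\t"
sample = "Der\tART\tdie\nHund\tNN\tHund\n\nbellt\tVVFIN\tbellen"
for tagged_sent in sample.strip().split(SENT_SEP_):
    for tagged_token in tagged_sent.strip().split(TOK_SEP_):
        cols = tagged_token.strip().split(TAG_SEP_)
        print(cols[0], cols[1], cols[2])  # word, tag, lemma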
Example #19
def posset(out, pos, separator="|"):
    """Concatenate values from two annotations, with an optional separator.
       Removes superfluous separators"""
    oldpos = util.read_annotation(pos)
    OUT = {}

    # dummy function to annotated thepos with separators
    def makeset(tokid, thepos):
        return [thepos]

    annotate_standard(out, pos, makeset, split=False)
Example #20
def merge(out, left, right, separator=""):
    """Concatenate values from two annotations, with an optional separator.
       Removes superfluous separators"""
    b = util.read_annotation(right)
    OUT = {}

    for key_a, val_a in util.read_annotation_iteritems(left):
        val = [x for x in [val_a, b[key_a]] if x != separator]
        OUT[key_a] = separator.join(list(val)) if val else separator

    util.write_annotation(out, OUT)
Example #21
def metadata(out, chunk, source, model, text=None, method="populous", language=[], encoding="UTF-8"):
    """Get location data based on metadata containing location names.
    """

    if isinstance(language, str):
        language = language.split()

    model = load_model(model, language=language)

    same_target_source = chunk == source
    chunk = util.read_annotation(chunk)
    source = util.read_annotation(source)

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source and text:
        text = util.read_corpus_text(text)
        target_source_parents = parent.annotate_parents(text, None, source, chunk, ignore_missing_parent=True)

    result = {}
    chunk_locations = {}

    for c in chunk:
        if same_target_source:
            location_source = source.get(c)
        else:
            location_source = source.get(target_source_parents.get(c))

        if location_source:
            location_data = model.get(location_source.strip().lower())
            if location_data:
                chunk_locations[c] = [(location_source, list(location_data))]
        else:
            chunk_locations[c] = []

    chunk_locations = most_populous(chunk_locations)

    for c in chunk:
        result[c] = _format_location(chunk_locations.get(c, ()))

    util.write_annotation(out, result)
Example #22
def write_formatted(out, annotations_columns, annotations_structs, columns, structs, structs_count, text):
    """
    The 'formatted' XML part of the 'export' function: exports XML with the same
    whitespace and indentation as in the original.
    """
    txt, anchor2pos, pos2anchor = util.corpus.read_corpus_text(text)
    structs_order = ["__token__"] + [s[0] for s in structs]
    anchors = defaultdict(dict)
    for elem, attrs in structs:
        for attr in attrs:
            struct = util.read_annotation(annotations_structs[attr[1]][0])
            for edge in struct:
                if util.edgeStart(edge) == util.edgeEnd(edge):
                    anchors[util.edgeStart(edge)].setdefault("structs", {}).setdefault((elem, anchor2pos[util.edgeEnd(edge)], "close"), []).append((attr[0], struct[edge]))
                else:
                    anchors[util.edgeStart(edge)].setdefault("structs", {}).setdefault((elem, anchor2pos[util.edgeEnd(edge)]), []).append((attr[0], struct[edge]))
                    anchors[util.edgeEnd(edge)].setdefault("close", set()).add((elem, edge))
    for n, annot in enumerate(annotations_columns):
        n += structs_count
        for tok, value in util.read_annotation_iteritems(annot):
            if n > structs_count:  # Any column except the first (the word)
                value = "|" if value == "|/|" else value
            anchors[util.edgeStart(tok)].setdefault("token", []).append(value.replace("\n", " "))
            if n == structs_count:
                anchors[util.edgeEnd(tok)].setdefault("close", set()).add(("__token__", None))
    currpos = 0

    with open(out, "w") as OUT:
        OUT.write("<corpus>")
        for pos, anchor in sorted(list(pos2anchor.items()), key=lambda x: x[0]):
            OUT.write(txt[currpos:pos].replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;"))
            if anchor in anchors:
                if "close" in anchors[anchor]:
                    if ("__token__", None) in anchors[anchor]["close"]:
                        OUT.write("</w>")
                    OUT.write("".join("</%s>" % e[0] for e in sorted(anchors[anchor]["close"], key=lambda x: structs_order.index(x[0])) if not e[0] == "__token__"))

                if "structs" in anchors[anchor]:
                    for elem, annot in sorted(iter(list(anchors[anchor]["structs"].items())), key=lambda x: (-x[0][1], -structs_order.index(x[0][0]))):
                        if elem not in ("close", "token"):
                            attrstring = "".join(' %s="%s"' % (attr, val.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;"))
                                                 for (attr, val) in annot if val and not attr == UNDEF)
                            close = "/" if len(elem) == 3 else ""
                            OUT.write("<%s%s%s>" % (elem[0], attrstring, close))

                if "token" in anchors[anchor]:
                    attrstring = "".join(' %s="%s"' % (columns[i + 1], a.replace("&", "&amp;").replace('"', '&quot;').replace("<", "&lt;").replace(">", "&gt;"))
                                         for i, a in enumerate(anchors[anchor]["token"][1:]) if a)
                    OUT.write("<w%s>" % attrstring)

            currpos = pos
        OUT.write("</corpus>")
    util.log.info("Exported: %s", out)
Example #23
def word_weights(model, word, pos, out):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + '.json'))
    index_to_label = m_json['index_to_label']
    min_word_length = int(m_json['min_word_length'] or '0')
    banned_pos = (m_json['banned_pos'] or '').split()
    words = util.read_annotation(word)
    poss = util.read_annotation(pos) if pos else {}
    data = (Example(None, vw_normalize(word))
            for span, word in list(words.items())
            if len(word) >= min_word_length
            if not pos or poss[span] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ['--initial_regressor', model, '--invert_hash', tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, 'r').readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(':')
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == ']':
                    bracesplit = word.rsplit('[', 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ':' + weight)
    ws = ((span, '|' + '|'.join(weights[vw_normalize(word)]) + '|')
          for span, word in list(words.items())
          if vw_normalize(word) in weights)
    util.write_annotation(out, ws)
Example #24
def sentiment(sense,
              out_scores,
              out_labels,
              model,
              max_decimals=6,
              lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.
    When more than one sense is possible, use the sense with the highest probability.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - max_decimals: int stating the amount of decimals the result is rounded to.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = util.read_annotation(sense)
    result_scores = {}
    result_labels = {}

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [
            tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else
            (s, -1.0) for s in sense[token].split(util.DELIM) if s
        ]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores[token] = score
            result_labels[token] = SENTIMENT_LABLES.get(int(score))
        else:
            result_scores[token] = None
            result_labels[token] = None

    util.write_annotation(out_scores, result_scores)
    util.write_annotation(out_labels, result_labels)
Example #25
def mergemany(out, annotations, separator="|"):
    """Concatenate values from two or more annotations, with an optional separator.
       Removes superfluous separators"""
    # annotations = [util.read_annotation(a) for a in annotations]
    d = {}
    OUT = {}

    if isinstance(annotations, str):
        annotations = annotations.split()
    for annotation in [util.read_annotation(a) for a in annotations]:
        for key_a, val_a in list(annotation.items()):
            if val_a:
                d.setdefault(key_a, []).append(val_a)

    for key, lst in list(d.items()):
        OUT[key] = separator + separator.join(
            lst) + separator if lst else separator

    util.write_annotation(out, OUT)
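# Hedged, self-contained illustration of mergemany's joining behaviour with the
# default separator "|"; the annotation dicts are invented, and empty values are dropped.
separator = "|"
toy_annotations = [{"w1": "PM"}, {"w1": "NN", "w2": "VB"}, {"w1": ""}]
collected = {}
for annotation in toy_annotations:
    for key, val in annotation.items():
        if val:
            collected.setdefault(key, []).append(val)
print({k: separator + separator.join(v) + separator for k, v in collected.items()})
# {'w1': '|PM|NN|', 'w2': '|VB|'}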
Example #26
def msdtag(model, out, word, sentence, tag_mapping=None, morphtable=None, patterns=None, encoding=util.UTF8):
    """POS/MSD tag using the Hunpos tagger.
    """
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.__dict__[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns, mode="r", encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """ Replace word with alias if word matches a regex pattern. """
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences = [sent.split() for _, sent in util.read_annotation_iteritems(sentence)]
    WORD = util.read_annotation(word)
    stdin = SENT_SEP.join(TOK_SEP.join(replace_word(WORD[tokid]) for tokid in sent)
                          for sent in sentences)
    args = [model]
    if morphtable:
        args.extend(["-m", morphtable])
    stdout, _ = util.system.call_binary("hunpos-tag", args, stdin, encoding=encoding, verbose=True)

    OUT = {}
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            OUT[token_id] = tag
    util.write_annotation(out, OUT)
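# Hedged illustration of the pattern mechanism above: a pattern-file line
# "name<TAB>regex<TAB>tags" turns matching words into the alias "[[name]]" before
# they are sent to hunpos. The pattern line below is made up.
import re
toy_line = "url\thttps?://\\S+\tUO"
toy_name, toy_pattern, toy_tags = toy_line.strip().split("\t", 2)
toy_patterns = [(toy_name, re.compile("^%s$" % toy_pattern), toy_tags)]

def toy_replace_word(w):
    for p in toy_patterns:
        if re.match(p[1], w):
            return "[[%s]]" % p[0]
    return w

print(toy_replace_word("https://spraakbanken.gu.se"))  # [[url]]
print(toy_replace_word("hunden"))                      # hunden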
Example #27
def annotate(out, lemgram, model, affix="|", delimiter="|"):
    """ Annotates each lemgram with its corresponding saldo_id,
        according to model (crosslink.pickle)
      - out is the resulting annotation file
      - lemgram is the existing annotations for lemgrams
      - model is the crosslink model
    """
    lexicon = PivotLexicon(model)
    WORD = util.read_annotation(lemgram)

    OUT = {}

    for tokid in WORD:
        saldo_ids = []
        for lemgram in WORD[tokid].split(delimiter):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        OUT[tokid] = affix + delimiter.join(
            set(saldo_ids)) + affix if saldo_ids else affix

    util.write_annotation(out, OUT)
Example #28
def sentiment_class(out, sent, classes):
    """Translate numeric sentiment values into classes.
    - out: resulting annotation file.
    - sent: existing sentiment annotation.
    - classes: numeric spans and classes, in the format '0:0.33:negative|0.33:0.66:neutral|0.66:1:positive'."""

    classes = dict((tuple(float(n) for n in c.split(":")[:2]), c.split(":")[2])
                   for c in classes.split("|"))
    sent = util.read_annotation(sent)
    result = {}

    for token in sent:
        if not sent[token]:
            result[token] = None
            continue
        sent_value = float(sent[token])
        for c in classes:
            if c[0] <= sent_value <= c[1]:
                result[token] = classes[c]
                break

    util.write_annotation(out, result)
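# Hedged, standalone illustration of the classes argument: the string below is an
# example value and is parsed into a {(low, high): label} mapping exactly as above.
classes_str = "0:0.33:negative|0.33:0.66:neutral|0.66:1:positive"
spans = dict((tuple(float(n) for n in c.split(":")[:2]), c.split(":")[2])
             for c in classes_str.split("|"))
print(spans[(0.0, 0.33)])  # negative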
Example #29
def do_segmentation(text,
                    element,
                    out,
                    chunk,
                    segmenter,
                    existing_segments=None,
                    model=None,
                    no_pickled_model=False):
    """Segment all "chunks" (e.g. sentences) into smaller "tokens" (e.g. words),
    and annotate them as "element" (e.g. w).
    Segmentation is done by the given "segmenter"; some segmenters take
    an extra argument which is a pickled "model" object.
    """
    if model:
        if not no_pickled_model:
            with open(model, "rb") as M:
                model = pickle.load(M, encoding='UTF-8')
        segmenter_args = (model, )
    else:
        segmenter_args = ()
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(
        sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(
        segmenter, "span_tokenize"
    ), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    corpus_text, anchor2pos, pos2anchor = util.read_corpus_text(text)

    # First we read the chunks and partition the text into spans
    # E.g., "one two <s>three four</s> five <s>six</s>"
    #   ==> ["one two ", "three four", " five ", "six"]
    #   (but using spans (pairs of anchors) instead of strings)

    positions = set()
    for c in chunk.split():
        CHUNK = util.read_annotation(c)
        positions = positions.union(
            set(anchor2pos[anchor] for edge in CHUNK
                for span in util.edgeSpans(edge) for anchor in span))
    positions = sorted(set([0, len(corpus_text)]) | positions)
    chunk_spans = list(zip(positions, positions[1:]))

    if existing_segments:
        OUT = util.read_annotation(existing_segments)
        token_spans = sorted((anchor2pos[start], anchor2pos[end])
                             for edge in OUT
                             for (start, end) in util.edgeSpans(edge))
        for n, (chunkstart, chunkend) in enumerate(chunk_spans[:]):
            for tokenstart, tokenend in token_spans:
                if tokenend <= chunkstart:
                    continue
                if tokenstart >= chunkend:
                    break
                if chunkstart != tokenstart:
                    chunk_spans.append((chunkstart, tokenstart))
                chunkstart = tokenend
                chunk_spans[n] = (chunkstart, chunkend)
        chunk_spans.sort()
        util.log.info("Reorganized into %d chunks" % len(chunk_spans))
    else:
        OUT = {}

    # Now we can segment each chunk span into tokens
    for start, end in chunk_spans:
        for spanstart, spanend in segmenter.span_tokenize(
                corpus_text[start:end]):
            spanstart += start
            spanend += start
            if corpus_text[spanstart:spanend].strip():
                span = pos2anchor[spanstart], pos2anchor[spanend]
                edge = util.mkEdge(element, span)
                OUT[edge] = None

    util.write_annotation(out, OUT)
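# Hedged sketch of the interface a segmenter must provide: any object with a
# span_tokenize(text) -> iterable of (start, end) pairs will do. This whitespace
# splitter is a made-up stand-in, not one of the real SEGMENTERS.
import re

class ToyWhitespaceSegmenter:
    def span_tokenize(self, text):
        return [m.span() for m in re.finditer(r"\S+", text)]

print(ToyWhitespaceSegmenter().span_tokenize("one two  three"))  # [(0, 3), (4, 7), (9, 14)]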
Example #30
def annotate(out_complemgrams,
             out_compwf,
             out_baseform,
             word,
             msd,
             baseform_tmp,
             saldo_comp_model,
             nst_model,
             stats_model,
             complemgramfmt=util.SCORESEP + "%.3e",
             delimiter=util.DELIM,
             compdelim=util.COMPSEP,
             affix=util.AFFIX,
             cutoff=True,
             saldo_comp_lexicon=None,
             stats_lexicon=None):
    """Divides compound words into prefix(es) and suffix.
    - out_complemgrams is the resulting annotation file for compound lemgrams
      and their probabilities
    - out_compwf is the resulting annotation file for compound wordforms
    - out_baseform is the resulting annotation file for baseforms (including baseforms for compounds)
    - word and msd are existing annotations for wordforms and MSDs
    - baseform_tmp is the existing temporary annotation file for baseforms (not including compounds)
    - saldo_comp_model is the Saldo compound model
    - nst_model is the NST part of speech compound model
    - stats_model is the statistics model (pickled file)
    - complemgramfmt is a format string for how to print the complemgram and its probability
      (use an empty string to omit the probability)
    - saldo_comp_lexicon, stats_lexicon: these arguments cannot be set from the command line,
      but are used in the catapult. These arguments must be last.
    """

    ##################
    # Load models
    ##################
    if not saldo_comp_lexicon:
        saldo_comp_lexicon = SaldoCompLexicon(saldo_comp_model)

    with open(nst_model, "rb") as f:
        nst_model = pickle.load(f)

    if not stats_lexicon:
        stats_lexicon = StatsLexicon(stats_model)

    WORD = util.read_annotation(word)
    MSD = util.read_annotation(msd)

    # Create alternative lexicon (for words within the file)
    altlexicon = InFileLexicon(WORD, MSD)

    ##################
    # Do annotation
    ##################
    OUT_complem = {}
    OUT_compwf = {}
    OUT_baseform = {}
    IN_baseform = util.read_annotation(baseform_tmp)

    previous_compounds = {}

    for tokid in WORD:
        key = (WORD[tokid], MSD[tokid])
        if key in previous_compounds:
            compounds = previous_compounds[key]
        else:
            compounds = compound(saldo_comp_lexicon, altlexicon, WORD[tokid],
                                 MSD[tokid])

            if compounds:
                compounds = rank_compounds(compounds, nst_model, stats_lexicon)

                if cutoff:
                    # Only keep analyses with the same length (or +1) as the most probable one
                    best_length = len(compounds[0][1])
                    i = 0
                    for c in compounds:
                        if len(c[1]) > best_length + 1 or len(
                                c[1]) < best_length:
                            break

                        i += 1
                    compounds = compounds[:i]

            previous_compounds[key] = compounds

        # Create complem and compwf annotations
        make_complem_and_compwf(OUT_complem, OUT_compwf, complemgramfmt, tokid,
                                compounds, compdelim, delimiter, affix)

        # Create new baseform annotation if necessary
        if IN_baseform[tokid] != affix:
            OUT_baseform[tokid] = IN_baseform[tokid]
        else:
            make_new_baseforms(OUT_baseform, tokid, MSD[tokid], compounds,
                               stats_lexicon, altlexicon, delimiter, affix)

    util.write_annotation(out_complemgrams, OUT_complem)
    util.write_annotation(out_compwf, OUT_compwf)
    util.write_annotation(out_baseform, OUT_baseform)
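# Hedged, standalone illustration of the cutoff step above: only analyses whose
# part count equals the best one (or best + 1) survive. The ranked list is made up;
# each entry is (score, parts).
ranked = [(0.9, ("fot", "boll")), (0.7, ("fot", "boll")),
          (0.5, ("fot", "bo", "ll")), (0.1, ("f", "ot", "bo", "ll"))]
best_length = len(ranked[0][1])
i = 0
for c in ranked:
    if len(c[1]) > best_length + 1 or len(c[1]) < best_length:
        break
    i += 1
print(ranked[:i])  # keeps the two 2-part analyses and the single 3-part one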