Пример #1
0
def lemmatize(lemmatizer, conllu, morphs):
    """Run ``lemmatizer`` over CoNLL-U text and return ``get_col(text, 2)``.

    The input string is split into sentences on blank lines and into
    tab-separated rows, converted with ``CoNLL.convert_conll``, wrapped in a
    ``Document`` and handed to ``lemmatizer``.  When
    ``lemmatizer.do_post_process`` is truthy, every lemma is additionally
    passed through ``post_process`` together with the matching ``morphs``
    entry.

    Args:
        lemmatizer: Callable that takes a ``Document`` and returns an object
            with ``.sentences``; must expose a ``do_post_process`` attribute.
        conllu: CoNLL-U formatted text (tab-separated rows, sentences
            separated by a blank line).
        morphs: Sequence indexed by the running token position across all
            sentences; each entry is searched with ``in`` for markers such
            as ``"Plur"`` or ``"Fut"``.

    Returns:
        Whatever ``get_col`` returns for column index 2 of the rebuilt
        10-column text (the lemma sits at index 2 of every row built here;
        ``get_col`` itself is defined elsewhere -- presumably it extracts
        that column).
    """
    # Hebrew letters mapped to the form they take at the end of a word.
    finals = {"פ": "ף", "כ": "ך", "מ": "ם", "נ": "ן", "צ": "ץ"}

    def clean_final(text):
        """Swap a trailing letter for its word-final form when it has one."""
        # Guard against the empty string: stripping a two-letter plural
        # suffix from a two-letter word leaves nothing to inspect.
        if text and text[-1] in finals:
            text = text[:-1] + finals[text[-1]]
        return text

    def post_process(word, pos, lemma, morph):
        """Repair lemmas that the lemmatizer left identical to the word form."""
        if word != lemma:
            return lemma

        # ``lex`` is defined outside this function; it is looked up here
        # with "word<TAB>pos" keys.
        key = word + "\t" + pos
        if key in lex:
            inflected_verb = pos == "VERB" and any(
                marker in morph for marker in ("Fut", "Pres", "Part"))
            plural_nominal = pos in ("NOUN", "ADJ") and "Plur" in morph
            if inflected_verb or plural_nominal:
                lemma = lex[key]
        elif ("Plur" in morph and pos in ("NOUN", "ADJ")
              and word.endswith(("ים", "ות"))):
            # No lexicon entry: strip the two-letter plural suffix by rule.
            lemma = lemma[:-2]
            if word.endswith("ות"):
                lemma += "ה"
            lemma = clean_final(lemma)
        return lemma

    uposed = [[row.split("\t") for row in sent.split("\n")]
              for sent in conllu.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)
    for sent in dicts:
        for tok in sent:
            # Keep only the first component of the id and store it as an int
            # (presumably convert_conll yields a tuple here -- TODO confirm).
            tok["id"] = int(tok["id"][0])
    doc = Document(dicts)
    lemmatized = lemmatizer(doc)

    output = []
    counter = 0  # running token index into ``morphs`` across all sentences
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            lemma = word.lemma
            if lemmatizer.do_post_process:
                lemma = post_process(word.text, word.upos, word.lemma, morphs[counter])
            row = [str(word.id), word.text, lemma, word.upos, word.xpos, '_', str(word.head), "_", "_", "_"]
            output.append("\t".join(row))
            counter += 1
        # Blank line terminates each sentence.
        output.append("")
    lemmatized = "\n".join(output)
    lemmatized = get_col(lemmatized, 2)

    return lemmatized
def extract_features(writer, language, corpus, sentence_list):
    """Write observed, optimal and random baseline rows for each sentence.

    For every sentence that passes the sanity checks below, one "observed"
    row, one "optimal" row and ten "random" rows are passed to
    ``writer.writerow``.  Each row is its own dictionary, so a later
    baseline can never overwrite the values of an earlier one.

    Args:
        writer: Object with a ``writerow(dict)`` method (e.g. a
            ``csv.DictWriter`` -- TODO confirm against the caller).
        language: Value stored under the ``"language"`` key of every row.
        corpus: Value stored under ``"corpus"`` and used as the prefix of
            the ``"id"`` field.
        sentence_list: Iterable of sentences; each sentence is a sequence of
            token rows indexable at positions 1 and 7.

    Returns:
        None.  Results are emitted through ``writer``.
    """
    # NOTE(review): this counter advances only for sentences that are
    # written or that fail the empty-dependency check; the other skip paths
    # leave it unchanged, so it is not a position in ``sentence_list``.
    sentence_id = 0

    for sentence in sentence_list:
        root = get_root(sentence)

        # First sanity check: is there a verbal root?
        if root is None:
            continue

        sentence_all = remove_punct_particles(sentence)
        sentence_open = remove_closed_class(sentence)

        # Convert back to stanza for later tree creation (lazy)
        try:
            document_all = stanza.Document(CoNLL.convert_conll([sentence_all]))
            document_open = stanza.Document(
                CoNLL.convert_conll([sentence_open]))
        except Exception:
            # Best-effort: skip unparseable sentences, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare ``except`` would.
            print("WARNING: Could not parse {0}".format(sentence_id))
            continue

        try:
            dependency_tree_all = tree(document_all.sentences[0].dependencies)
            dependency_tree_open = tree(
                document_open.sentences[0].dependencies)
        except Exception:
            print("WARNING: Could not create tree for {0}".format(sentence_id))
            continue

        # Second sanity check: can we make a tree?
        if len(dependency_tree_all) == 0 or len(dependency_tree_open) == 0:
            print(root)
            text = []
            for tok in sentence:
                text.append(tok[1])
                text.append(tok[7])
            print(text)
            print("WARNING: Dependencies empty! (sentence {0})".format(sentence_id))
            sentence_id += 1
            continue

        # Third sanity check: does it meet order_info requirements?
        root = get_root(sentence_all)  # Retrieve new verb index
        order_info = determine_order_from_constituents(root, sentence_all)
        if order_info is None:
            continue

        # Fields shared by every row of this sentence.
        base_row = {
            "language": language,
            "corpus": corpus,
            "id": "{0}_{1}".format(corpus, sentence_id),
            "original_length": len(sentence),
        }
        base_row.update(order_info)
        base_row.update(head_final(sentence_all, sentence_open))

        # Copy the shared fields for each baseline instead of aliasing one
        # dict; aliasing made the "observed" row carry the "optimal" values
        # by the time it was written.
        observed_row = dict(base_row, baseline="observed")
        observed_row.update(get_dep_length(sentence_all, sentence_open))

        optimal_row = dict(base_row, baseline="optimal")
        optimal_row.update(
            get_optimal_dep_length(dependency_tree_all, dependency_tree_open))

        writer.writerow(observed_row)
        writer.writerow(optimal_row)

        # Ten independent random baselines per sentence.
        for _ in range(10):
            random_row = dict(base_row, baseline="random")
            random_row.update(
                get_random_dep_lengths(dependency_tree_all,
                                       dependency_tree_open))
            writer.writerow(random_row)

        sentence_id += 1
Пример #3
0
    def __init__(self, gold, pred, verbose=False, group=False):
        """
        Align gold and predicted tokens and their tags, and count the tag confusions.

        :param gold: path to the gold CoNLL-U file
        :param pred: path to the predicted CoNLL-U file
        :param verbose: if true, print token and alignment counts
        :param group: if true, store falsely predicted ufeats labels as a nested dictionary
            ``{gold_label: {predicted_label: count}}``; otherwise the counts are keyed by
            ``(gold_label, predicted_label)`` tuples
        """
        # Load and convert inside ``with`` so both file handles are closed
        # instead of being left to the garbage collector.
        with open(gold, 'r', encoding='utf8') as gold_file:
            gold_dic = C.convert_conll(C.load_conll(gold_file))  # list of sentences, each a list of token dicts
        # NOTE(review): gold_doc / pred_doc are not used below; they are kept
        # in case Document construction is relied on for validation -- confirm and drop.
        gold_doc = Document(gold_dic)

        with open(pred, 'r', encoding='utf8') as pred_file:
            pred_dic = C.convert_conll(C.load_conll(pred_file))
        pred_doc = Document(pred_dic)

        gold_entries = [tok for sent in gold_dic for tok in sent]
        pred_entries = [tok for sent in pred_dic for tok in sent]

        # get the tokens
        self.gold_tokens = [tok['text'] for tok in gold_entries]
        self.pred_tokens = [tok['text'] for tok in pred_entries]

        # A token dict may lack a tag key; a missing 'feats' was already
        # mapped to '_', and the same fallback is applied to upos and xpos so
        # an absent tag cannot raise KeyError.
        gold_tags = [tok.get('upos', '_') for tok in gold_entries]
        pred_tags = [tok.get('upos', '_') for tok in pred_entries]

        gold_xpos = [tok.get('xpos', '_') for tok in gold_entries]
        pred_xpos = [tok.get('xpos', '_') for tok in pred_entries]

        gold_feats = [tok.get('feats', '_') for tok in gold_entries]
        pred_feats = [tok.get('feats', '_') for tok in pred_entries]

        if verbose:
            print('Number of gold tokens:', len(self.gold_tokens), ', number of predicted tokens:', len(self.pred_tokens))

        # align gold and predicted tokens
        cost, a2b, b2a, a2b_multi, b2a_multi = align(self.gold_tokens, self.pred_tokens)

        # Pair every predicted token (and its tags) with the gold token it maps to.
        # NOTE(review): if ``align`` marks unaligned tokens with -1, these
        # lookups silently pick the last gold token -- confirm.
        self.aligned = list()  # tokens
        self.aligned_pos = list()  # upos
        self.aligned_feats = list()
        self.aligned_xpos = list()
        for pred_idx, gold_idx in enumerate(b2a):
            self.aligned.append((self.gold_tokens[gold_idx], self.pred_tokens[pred_idx]))
            self.aligned_pos.append((gold_tags[gold_idx], pred_tags[pred_idx]))
            self.aligned_feats.append((gold_feats[gold_idx], pred_feats[pred_idx]))
            self.aligned_xpos.append((gold_xpos[gold_idx], pred_xpos[pred_idx]))

        # align predicted tokens to gold tokens, not vice versa as before
        gold_aligned = [(self.gold_tokens[gold_idx], self.pred_tokens[pred_idx])
                        for gold_idx, pred_idx in enumerate(a2b)]

        # pairs whose gold and predicted token text is identical
        overall = [(a, b) for (a, b) in self.aligned if a == b]
        if verbose:
            print('Aligned tokens. GOLD:', len(gold_aligned), 'PREDICTED:', len(self.aligned), 'ALIGNED:', len(overall))

        self.conf_tags = {}  # falsely predicted upos tags
        self.conf_tags_all = {}  # all upos tags
        self.incorrect_upos = 0  # number of incorrectly predicted upos tags
        # how many times different tags co-occurred in gold and pred files
        for pair in self.aligned_pos:
            if pair[0] != pair[1]:
                self.incorrect_upos += 1
                self.conf_tags[pair] = self.conf_tags.get(pair, 0) + 1
            self.conf_tags_all[pair] = self.conf_tags_all.get(pair, 0) + 1

        def universal_only(feats):
            # Keep only features whose name is in UNIVERSAL_FEATURES, in sorted order.
            return "|".join(sorted(feat for feat in feats.split("|")
                                   if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))

        self.conf_feats = {}
        self.conf_feats_all = {}
        self.incorrect_feats = 0
        for (a, b) in self.aligned_feats:
            a = universal_only(a)
            b = universal_only(b)
            if a != b:
                self.incorrect_feats += 1
                if group:
                    # group all false predictions under their gold label
                    by_pred = self.conf_feats.setdefault(a, dict())
                    by_pred[b] = by_pred.get(b, 0) + 1
                else:
                    self.conf_feats[(a, b)] = self.conf_feats.get((a, b), 0) + 1
            self.conf_feats_all[(a, b)] = self.conf_feats_all.get((a, b), 0) + 1

        self.conf_xpos = {}
        self.incorrect_xpos = 0
        for pair in self.aligned_xpos:
            if pair[0] != pair[1]:
                self.incorrect_xpos += 1
                self.conf_xpos[pair] = self.conf_xpos.get(pair, 0) + 1
Пример #4
0
def gen_conll_sens(stream, swaps=()):
    """Yield one sentence object per sentence produced by ``gen_tsv_sens``.

    Each sentence from ``gen_tsv_sens(stream, swaps)`` is converted on its
    own with ``CoNLL.convert_conll``, wrapped in a ``StanzaDocument``, and
    the first sentence of that document is yielded.
    """
    for sentence_rows in gen_tsv_sens(stream, swaps):
        # Convert one sentence at a time so the generator stays lazy.
        converted = CoNLL.convert_conll([sentence_rows])
        document = StanzaDocument(converted)
        yield document.sentences[0]
Пример #5
0
def test_conll_to_dict():
    """Check that converting the ``CONLL`` fixture yields the ``DICT`` fixture."""
    assert CoNLL.convert_conll(CONLL) == DICT