def lemmatize(lemmatizer, conllu, morphs):
    def clean_final(text):
        # Swap a trailing Hebrew letter for its word-final form
        finals = {"פ": "ף", "כ": "ך", "מ": "ם", "נ": "ן", "צ": "ץ"}
        if text[-1] in finals:
            text = text[:-1] + finals[text[-1]]
        return text

    def post_process(word, pos, lemma, morph):
        if word == lemma:
            if word + "\t" + pos in lex:
                # The lemmatizer left the word unchanged: fall back to the lexicon
                # for inflected verbs and plural nouns/adjectives
                if pos == "VERB" and ("Fut" in morph or "Pres" in morph or "Part" in morph):
                    lemma = lex[word + "\t" + pos]
                if pos in ["NOUN", "ADJ"] and "Plur" in morph:
                    lemma = lex[word + "\t" + pos]
            else:
                # No lexicon entry: strip the plural suffix heuristically and
                # restore the feminine singular ending for -ot plurals
                if "Plur" in morph and pos in ["NOUN", "ADJ"] and (
                        word.endswith("ים") or word.endswith("ות")):
                    lemma = lemma[:-2]
                    if word.endswith("ות"):
                        lemma += "ה"
                    lemma = clean_final(lemma)
        return lemma

    uposed = [[l.split("\t") for l in s.split("\n")] for s in conllu.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)
    for sent in dicts:
        for tok in sent:
            tok["id"] = int(tok["id"][0])  # coerce ids to ints for stanza's Document
    doc = Document(dicts)
    lemmatized = lemmatizer(doc)

    output = []
    counter = 0
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            lemma = word.lemma
            if lemmatizer.do_post_process:
                lemma = post_process(word.text, word.upos, word.lemma, morphs[counter])
            row = [str(word.id), word.text, lemma, word.upos, word.xpos,
                   "_", str(word.head), "_", "_", "_"]
            output.append("\t".join(row))
            counter += 1
        output.append("")
    return get_col("\n".join(output), 2)
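# A minimal sketch of calling lemmatize(), assuming a stanza lemmatizer whose
# do_post_process flag is set elsewhere, plus the module-level lexicon `lex`
# ("word\tPOS" -> lemma) and the get_col helper used above; the wrapper name
# is a hypothetical addition, not part of the original module.
def lemmatize_file(lemmatizer, conllu_path, morphs):
    # morphs must hold one morphology string per token, in document order
    with open(conllu_path, encoding="utf8") as f:
        return lemmatize(lemmatizer, f.read(), morphs)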
def extract_features(writer, language, corpus, sentence_list):
    sent_id = 0
    for sentence in sentence_list:
        data = {}
        root = get_root(sentence)
        # First sanity check: is there a verbal root?
        if root is None:
            continue
        sentence_all = remove_punct_particles(sentence)
        sentence_open = remove_closed_class(sentence)
        # Convert back to stanza documents for later tree creation
        try:
            document_all = stanza.Document(CoNLL.convert_conll([sentence_all]))
            document_open = stanza.Document(CoNLL.convert_conll([sentence_open]))
        except Exception:
            print("WARNING: Could not parse {0}".format(sent_id))
            continue
        try:
            dependency_tree_all = tree(document_all.sentences[0].dependencies)
            dependency_tree_open = tree(document_open.sentences[0].dependencies)
        except Exception:
            print("WARNING: Could not create tree for {0}".format(sent_id))
            continue
        # Second sanity check: can we make a tree?
        if len(dependency_tree_all) == 0 or len(dependency_tree_open) == 0:
            print(root)
            text = []
            for tok in sentence:
                text.append(tok[1])
                text.append(tok[7])
            print(text)
            print("WARNING: Dependencies empty! (sentence {0})".format(sent_id))
            sent_id += 1
            continue
        # Third sanity check: does it meet order_info requirements?
        root = get_root(sentence_all)  # retrieve new verb index
        order_info = determine_order_from_constituents(root, sentence_all)
        if order_info is None:
            continue
        data.update({
            "language": language,
            "corpus": corpus,
            "id": "{0}_{1}".format(corpus, sent_id),
            "original_length": len(sentence),
        })
        data.update(order_info)
        data.update(head_final(sentence_all, sentence_open))
        # Copy the shared fields so each baseline gets its own row; aliasing
        # `data` directly would make every row carry the last baseline's values
        observed_data = dict(data)
        observed_data.update({"baseline": "observed"})
        observed_data.update(get_dep_length(sentence_all, sentence_open))
        optimal_data = dict(data)
        optimal_data.update({"baseline": "optimal"})
        optimal_data.update(get_optimal_dep_length(dependency_tree_all, dependency_tree_open))
        writer.writerow(observed_data)
        writer.writerow(optimal_data)
        for _ in range(10):
            random_data = dict(data)
            random_data.update({"baseline": "random"})
            random_data.update(get_random_dep_lengths(dependency_tree_all, dependency_tree_open))
            writer.writerow(random_data)
        sent_id += 1
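# A minimal sketch of driving extract_features(), assuming stanza's older
# CoNLL.load_conll API (which returns each sentence as a list of token rows,
# the shape the function above indexes into) and hypothetical file paths.
# `fieldnames` must cover every key the function writes, including whatever
# order_info, head_final, and the dep-length helpers add.
import csv

def run_extraction(conllu_path, out_path, language, corpus, fieldnames):
    with open(conllu_path, encoding="utf8") as f:
        sentences = CoNLL.load_conll(f)
    with open(out_path, "w", newline="", encoding="utf8") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        extract_features(writer, language, corpus, sentences)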
def __init__(self, gold, pred, verbose=False, group=False):
    """
    Align gold and predicted tokens and their tags, and create
    dictionaries of falsely predicted tags.

    :param gold: the gold conllu file
    :param pred: the predicted conllu file
    :param verbose: if true, print information about token numbers
    :param group: if true, put falsely predicted ufeats labels into a
        dictionary that contains all the labels each was falsely assigned
        and the number of times each predicted label was found
    """
    gold = C.load_conll(open(gold, 'r', encoding='utf8'))
    gold_dic = C.convert_conll(gold)  # list of sentences as dicts keyed by column name
    gold_doc = Document(gold_dic)
    pred = C.load_conll(open(pred, 'r', encoding='utf8'))
    pred_dic = C.convert_conll(pred)
    pred_doc = Document(pred_dic)

    # get the tokens
    self.gold_tokens = [j['text'] for i in gold_dic for j in i]
    self.pred_tokens = [j['text'] for i in pred_dic for j in i]
    # get upos tags
    gold_tags = [j['upos'] for i in gold_dic for j in i]
    pred_tags = [j['upos'] for i in pred_dic for j in i]
    # get xpos tags
    gold_xpos = [j['xpos'] for i in gold_dic for j in i]
    pred_xpos = [j['xpos'] for i in pred_dic for j in i]
    # get ufeats tags, defaulting to '_' when a token has no feats
    gold_feats = [j.get('feats', '_') for i in gold_dic for j in i]
    pred_feats = [j.get('feats', '_') for i in pred_dic for j in i]

    if verbose:
        print('Number of gold tokens:', len(self.gold_tokens),
              ', number of predicted tokens:', len(self.pred_tokens))

    # align gold and predicted tokens
    cost, a2b, b2a, a2b_multi, b2a_multi = align(self.gold_tokens, self.pred_tokens)

    # align tokens and their tags separately
    self.aligned = list()        # tokens
    self.aligned_pos = list()    # upos
    self.aligned_feats = list()  # ufeats
    self.aligned_xpos = list()   # xpos
    for i in range(len(b2a)):
        self.aligned.append((self.gold_tokens[b2a[i]], self.pred_tokens[i]))
        self.aligned_pos.append((gold_tags[b2a[i]], pred_tags[i]))
        self.aligned_feats.append((gold_feats[b2a[i]], pred_feats[i]))
        self.aligned_xpos.append((gold_xpos[b2a[i]], pred_xpos[i]))

    # align predicted tags to gold tags, not vice versa as before
    gold_aligned = list()
    for i in range(len(a2b)):
        gold_aligned.append((self.gold_tokens[i], self.pred_tokens[a2b[i]]))

    overall = [(a, b) for (a, b) in self.aligned if a == b]
    if verbose:
        print('Aligned tokens. GOLD:', len(gold_aligned),
              'PREDICTED:', len(self.aligned), 'ALIGNED:', len(overall))

    self.conf_tags = {}      # falsely predicted upos tags
    self.conf_tags_all = {}  # all upos tags
    self.incorrect_upos = 0  # number of incorrectly predicted upos tags
    # how many times different tags co-occurred in the gold and pred files
    for (a, b) in self.aligned_pos:
        if a != b:
            self.incorrect_upos += 1
            self.conf_tags[(a, b)] = self.conf_tags.get((a, b), 0) + 1
        self.conf_tags_all[(a, b)] = self.conf_tags_all.get((a, b), 0) + 1

    self.conf_feats = {}
    self.conf_feats_all = {}
    self.incorrect_feats = 0
    for (a, b) in self.aligned_feats:
        # keep only universal features, in a canonical sorted order
        a = "|".join(sorted(f for f in a.split("|") if f.split("=", 1)[0] in UNIVERSAL_FEATURES))
        b = "|".join(sorted(f for f in b.split("|") if f.split("=", 1)[0] in UNIVERSAL_FEATURES))
        if a != b:
            self.incorrect_feats += 1
            if group:
                # group all false predictions under each gold ufeats label
                self.conf_feats.setdefault(a, {})
                self.conf_feats[a][b] = self.conf_feats[a].get(b, 0) + 1
            else:
                self.conf_feats[(a, b)] = self.conf_feats.get((a, b), 0) + 1
        self.conf_feats_all[(a, b)] = self.conf_feats_all.get((a, b), 0) + 1

    self.conf_xpos = {}
    self.incorrect_xpos = 0
    for (a, b) in self.aligned_xpos:
        if a != b:
            self.incorrect_xpos += 1
            self.conf_xpos[(a, b)] = self.conf_xpos.get((a, b), 0) + 1
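# A small helper sketch for inspecting the confusion counts the constructor
# builds above; `summarize_eval` is a hypothetical name, and it assumes an
# instance built with group=False so conf_tags keys are (gold, pred) pairs.
def summarize_eval(evaluator, top_n=5):
    # Report raw error counts collected during alignment
    print("incorrect upos:", evaluator.incorrect_upos,
          "xpos:", evaluator.incorrect_xpos,
          "feats:", evaluator.incorrect_feats)
    # Show the most frequent gold -> predicted UPOS confusions
    for (g, p), n in sorted(evaluator.conf_tags.items(),
                            key=lambda kv: -kv[1])[:top_n]:
        print("{0} -> {1}: {2}".format(g, p, n))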
def gen_conll_sens(stream, swaps=()):
    for sen in gen_tsv_sens(stream, swaps):
        dic = CoNLL.convert_conll([sen])
        yield StanzaDocument(dic).sentences[0]
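# A minimal usage sketch: stream sentences from an open CoNLL file one at a
# time, so large files never load fully into memory. It relies on the same
# imports as the generator above; the printing helper itself is a
# hypothetical addition.
def print_sentence_texts(stream):
    for sen in gen_conll_sens(stream):
        print(" ".join(word.text for word in sen.words))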
def test_conll_to_dict():
    dicts = CoNLL.convert_conll(CONLL)
    assert dicts == DICT
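# A companion sketch: older stanza versions pair CoNLL.convert_conll with an
# inverse, CoNLL.convert_dict. This roundtrip check is an assumption about
# that API and about the fixtures being lossless; verify both against the
# stanza version under test before relying on it.
def test_conll_roundtrip():
    assert CoNLL.convert_dict(CoNLL.convert_conll(CONLL)) == CONLL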