def read_treebank(tb, max_size=None, shuffle=False, lowercase=True,
                  skip_multi=True,
                  pos_filter={'NOUN', 'VERB', 'ADJ', 'ADV'}):
    """Collect unique (form, lemma, features) triples from a treebank directory.

    Every ``*.conllu`` file directly under ``tb`` is read with
    ``conllu_sentences`` and each node that passes the filters below
    contributes one entry to the result.

    Args:
        tb: Path of the treebank directory.  ``lang_re`` is matched against
            it and group 1 of the match is passed to ``to_lower`` as the
            language.
        max_size: If truthy, keep at most this many entries.
        shuffle: If true, shuffle the entries with ``random.shuffle`` before
            truncating.
        lowercase: If true, pass form and lemma through ``to_lower``.
        skip_multi: If true, skip nodes for which ``sent.get_multi(node)``
            is truthy.  (Previously this flag was accepted but ignored and
            such nodes were always skipped.)
        pos_filter: Container of UPOS tags to keep.  It is only read, never
            modified, so the shared default is safe.

    Returns:
        A list of ``(form, lemma, feats)`` tuples without duplicates, where
        ``feats`` is a tuple starting with the node's UPOS tag followed by
        the ``|``-separated pieces of ``node.feats`` (when it is not None).
        The entries come from a set, so their order is arbitrary unless
        ``shuffle`` is requested; truncation by ``max_size`` therefore
        selects an arbitrary subset.

    Raises:
        AttributeError: If ``lang_re.match(tb)`` returns None (presumably a
            compiled regex that did not match -- ``lang_re`` is defined
            outside this block).
    """
    m = lang_re.match(tb)
    lang = m.group(1)

    entries = set()
    for tbf in glob.glob(tb + '/*.conllu'):
        for sent in conllu_sentences(tbf):
            for node in sent.nodes:
                # Nodes lacking a form or a lemma carry nothing to collect.
                if not (node.form and node.lemma):
                    continue
                # Honour the caller's choice instead of always skipping.
                if skip_multi and sent.get_multi(node):
                    continue
                if node.upos not in pos_filter:
                    continue

                feats = [node.upos]
                if node.feats is not None:
                    feats += node.feats.split("|")
                feats = tuple(feats)  # must be hashable to live in a set

                if lowercase:
                    entries.add((to_lower(node.form, lang),
                                 to_lower(node.lemma, lang),
                                 feats))
                else:
                    entries.add((node.form, node.lemma, feats))

    d = list(entries)
    if shuffle:
        random.shuffle(d)
    if max_size:
        d = d[:max_size]
    return d
#!/usr/bin/env python3
"""Add a missing ``PronType`` feature to pronouns and determiners.

Reads the CoNLL-U file named on the command line and, for every node after
the first one in each sentence that has no ``PronType`` feature yet:

* ``PRON`` nodes whose lemma is in the personal list get ``PronType=Prs``;
* ``PRON`` nodes whose lemma is in the demonstrative list get ``PronType=Dem``;
* ``DET`` nodes with a truthy ``Definite`` feature get ``PronType=Art``.

NOTE(review): the portion of the script visible here only modifies the nodes
in memory; any code that writes the result out is not shown.
"""
import argparse
import re
import sys

from udtools.conllu import conllu_sentences

# Lemma lists (they look like Turkish pronouns -- confirm with the author).
# NOTE(review): the original literal listed 'bazılarımız' twice; possibly a
# different lemma was intended for one of the two occurrences.
_PERSONAL_LEMMAS = (
    'ben', 'sen', 'biz', 'siz', 'hepimiz', 'herkes', 'kimse',
    'bizler', 'sizler', 'onlar', 'hiçbirimiz', 'hiçbiriniz',
    'bazılarınız', 'bazılarımız', 'kiminiz', 'kendi',
)
_DEMONSTRATIVE_LEMMAS = ('o', 'şu', 'bu', 'bura', 'şura', 'ora')

# Pronoun lemma -> PronType value to assign.
_PRON_TYPE_BY_LEMMA = {
    **dict.fromkeys(_PERSONAL_LEMMAS, 'Prs'),
    **dict.fromkeys(_DEMONSTRATIVE_LEMMAS, 'Dem'),
}

ap = argparse.ArgumentParser()
ap.add_argument('input_file')
args = ap.parse_args()

tb = conllu_sentences(args.input_file)
# The loop indices are not used in the visible code; they are kept so any
# later code relying on them keeps working.
for sent_num, sent in enumerate(tb):
    for i, node in enumerate(sent.nodes[1:]):
        # Only pronouns and determiners are candidates.
        if node.upos not in ('PRON', 'DET'):
            continue
        # Never overwrite an existing PronType.
        if node.get_feat('PronType') is not None:
            continue

        if node.upos == 'PRON':
            if node.lemma in _PRON_TYPE_BY_LEMMA:
                node.add_feat('PronType', _PRON_TYPE_BY_LEMMA[node.lemma])
        elif node.get_feat('Definite'):
            node.add_feat('PronType', 'Art')