def makeDoc(doc):
    """Turn the output of a dependency parser into a list of token objects.

    spaCy documents are returned untouched.  Every other input is first
    rendered as CoNLL-U text (through the library-specific call for stanza,
    classla/stanfordnlp, nltk and combo objects, by joining the items of a
    list of sentences, or with ``str(doc)`` as the fallback) and then parsed
    line by line.  Lines that do not have exactly ten tab-separated columns
    are ignored.

    Returns:
        The spaCy document itself, or a list with one entry per CoNLL-U word
        line.  Each entry exposes ``i`` (the ID column as int), ``orth_``,
        ``pos_``, ``dep_``, ``whitespace_`` and ``head`` (the entry of the
        governing token; a root points to itself).  The last word of a
        multiword-token range additionally carries
        ``contract = (surface_form, [list indices of the covered words])``.
    """
    # str(type(x)) looks like "<class 'package.module.Name'>", so the
    # top-level package name always starts at offset 8.
    kind = str(type(doc))
    if kind.find("spacy") == 8:
        return doc
    elif kind.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        d = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif kind.find("classla") == 8 or kind.find("stanfordnlp") == 8:
        d = doc.conll_file.conll_as_string()
    elif kind.find("nltk") == 8:
        d = doc.to_conll(10)
    elif kind.find("combo") == 8:
        from combo.data import sentence2conllu
        d = sentence2conllu(doc, False).serialize()
    elif kind.find("list") == 8:
        d = "".join("".join(str(t) + "\n" for t in sent) + "\n" for sent in doc)
    else:
        d = str(doc)

    DOC = []
    ranges = []     # (index of next token in DOC, first id, last id, form)
    prev_misc = ""  # MISC column of the previously accepted word line
    for line in d.split("\n"):
        x = line.split("\t")
        if len(x) != 10:
            continue
        try:
            i, j = int(x[0]), int(x[6])
        except ValueError:
            # Not a plain word line: either a multiword-token range "a-b"
            # or something else (e.g. an empty node), which is skipped.
            try:
                dash = x[0].index("-")
                first = int(x[0][0:dash])
                last = int(x[0][dash + 1:])
            except ValueError:
                continue
            ranges.append((len(DOC), first, last, x[1]))
            continue
        # Each token is a throw-away class whose attributes mimic spaCy's.
        tok = type("", (object, ), {"i": i})
        tok.orth_ = x[1]
        tok.pos_ = x[3]
        tok.head = j
        tok.dep_ = x[7]
        tok.whitespace_ = (x[9].find("SpaceAfter=No") < 0)
        if tok.whitespace_:
            pos = x[9].find("start_char=")
            if pos >= 0:
                bar = x[9].find("|", pos)
                # Dropping the leading "start" leaves "_char=N"; prefixing
                # "end" yields "end_char=N".  If the previous token ends
                # where this one starts there is no space between them.
                tail = x[9][pos + 5:] if bar < 0 else x[9][pos + 5:bar]
                if prev_misc.find("end" + tail) >= 0:
                    DOC[-1].whitespace_ = False
        DOC.append(tok)
        prev_misc = x[9]

    # Attach every multiword-token range to its last member word.
    for start, first, last, form in reversed(ranges):
        # Converts a sentence-local word id into an index of DOC.
        offset = start - DOC[start].i
        DOC[last + offset].contract = (
            form, [n + offset for n in range(first, last + 1)])

    # Replace numeric heads by references to the governing token.
    for idx, tok in enumerate(DOC):
        if tok.head == 0:
            tok.head = tok
        else:
            tok.head = DOC[idx + tok.head - tok.i]
    return DOC
def __init__(self,UniDic,UDPipe):
    """Select the tokenizer (``self.mecab``) and parser (``self.udpipe``).

    Args:
        UniDic: Dictionary name, or ``None`` to configure no tokenizer.
            A directory of that name under ``DOWNLOAD_DIR`` is used as a
            local MeCab dictionary; ``"unidic-lite"`` and ``"ipadic"`` use
            the Python packages of the same name; any other value must be
            one of the Chamame keys listed below and selects
            ``self.ChamameWebAPI``.
        UDPipe: Model name; ``None`` means ``"japanese-gsd"``.  If
            ``<DOWNLOAD_DIR>/<name>.udpipe`` exists it is loaded with
            ``ufal.udpipe``; a name starting with ``"stanza_"`` builds a
            stanza pipeline for the language given after the prefix;
            otherwise ``self.UDPipeWebAPI`` stays in place.

    Raises:
        KeyError: ``UniDic`` is neither a local dictionary, a supported
            package name, nor a known Chamame key.
    """
    def load_tagger():
        # Prefer mecab-python; fall back to fugashi when it cannot be imported.
        try:
            from MeCab import Tagger
        except Exception:
            from fugashi import GenericTagger as Tagger
        return Tagger

    self.UniDic = UniDic
    if UniDic is not None:
        d = os.path.join(DOWNLOAD_DIR, UniDic)
        r = os.path.join(PACKAGE_DIR, "mecabrc")
        if os.path.isdir(d):
            Tagger = load_tagger()
            self.mecab = Tagger("-r " + r + " -d " + d).parse
        elif UniDic == "unidic-lite":
            Tagger = load_tagger()
            import unidic_lite
            self.mecab = Tagger("-r " + r + " -d " + unidic_lite.DICDIR).parse
        elif UniDic == "ipadic":
            Tagger = load_tagger()
            try:
                import ipadic
                self.mecab = Tagger(ipadic.MECAB_ARGS).parse
            except Exception:
                # Without a usable ipadic package, use the tagger's defaults.
                self.mecab = Tagger().parse
        else:
            chamame_keys = {
                "gendai": "dic1",
                "spoken": "dic2",
                "qkana": "dic3",
                "kindai": "dic4",
                "kinsei": "dic5",
                "kyogen": "dic6",
                "wakan": "dic7",
                "wabun": "dic8",
                "manyo": "dic9",
            }
            self.dictkey = chamame_keys[UniDic]
            self.mecab = self.ChamameWebAPI

    # The web API is the default parser; a local model below replaces it.
    self.udpipe = self.UDPipeWebAPI
    self.model = "japanese-gsd" if UDPipe is None else UDPipe
    m = os.path.join(DOWNLOAD_DIR, self.model + ".udpipe")
    if os.path.isfile(m):
        import ufal.udpipe
        self.model = ufal.udpipe.Model.load(m)
        if UniDic is None:
            # No tokenizer of our own: UDPipe tokenizes the input itself.
            self.udpipe = ufal.udpipe.Pipeline(
                self.model, "tokenizer=presegmented", "", "", "").process
        else:
            # Input is CoNLL-U; tagging is switched off ("none").
            self.udpipe = ufal.udpipe.Pipeline(
                self.model, "conllu", "none", "", "").process
    elif self.model.startswith("stanza_"):
        import stanza
        lang = self.model[7:]  # text after the "stanza_" prefix
        if UniDic is None:
            self.model = stanza.Pipeline(lang, verbose=False)
            from stanza.utils.conll import CoNLL
            self.udpipe = lambda text: CoNLL.conll_as_string(
                CoNLL.convert_dict(self.model(text).to_dict()))
        else:
            self.model = stanza.Pipeline(
                lang, processors="depparse", depparse_pretagged=True,
                verbose=False)
            self.udpipe = self.StanzaAPI
def StanzaAPI(self,conllu):
    """Run ``self.model`` on pre-tagged CoNLL-U text and return CoNLL-U.

    Args:
        conllu: CoNLL-U text.  Blank lines and lines starting with ``#``
            end the current sentence; every other line must have ten
            tab-separated columns with an integer ID.

    Returns:
        The CoNLL-U string produced from the document returned by
        ``self.model``.

    Raises:
        ValueError: a token line has a non-integer ID column.
        IndexError: a token line has fewer than ten columns.
    """
    sentences = []
    current = []
    for line in conllu.split("\n"):
        if line == "" or line.startswith("#"):
            if current:
                sentences.append(current)
                current = []
        else:
            t = line.split("\t")
            current.append({
                "id": int(t[0]),
                "text": t[1],
                "lemma": t[2],
                "upos": t[3],
                "xpos": t[4],
                "misc": t[9],
            })
    # Input that does not end with a blank line would otherwise lose its
    # final sentence.
    if current:
        sentences.append(current)
    from stanza.models.common.doc import Document
    from stanza.utils.conll import CoNLL
    parsed = self.model(Document(sentences))
    return CoNLL.conll_as_string(CoNLL.convert_dict(parsed.to_dict()))
def main(args):
    """Parse the sentences from ``args.input`` and write CoNLL-U output.

    Reads ``args.input`` with ``load_conllu``, runs a stanza pipeline
    configured from ``args.lg``, ``args.model``, ``args.treebank`` and
    ``args.cuda`` (sentence splitting disabled), and writes the resulting
    CoNLL-U text to ``args.output``.
    """
    sents = load_conllu(args.input)
    stanza_nlp = stanza.Pipeline(
        lang=args.lg,
        dir=args.model,
        package=args.treebank,
        tokenize_no_ssplit=True,
        use_gpu=args.cuda,
    )
    doc = stanza_nlp(sents)
    doc_dict = doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)
    # CoNLL-U files are UTF-8; do not depend on the platform default.
    with open(args.output, "w", encoding="utf-8") as wf:
        wf.write(doc_conll_str)
def get_depd_tree(
    doc: str,
    lg: str,
    stanza_model_path: Path,
    tokenize: bool = True,
    ssplit: bool = False,
    cuda: bool = False,
    verbose: bool = False,
) -> str:
    """Parse ``doc`` with stanza and return the result as CoNLL-U text.

    Args:
        doc: Input passed to the stanza pipeline.
        lg: Language code for the pipeline.
        stanza_model_path: Directory holding the stanza models.
        tokenize: When false, the pipeline runs with
            ``tokenize_pretokenized=True``.
        ssplit: When false (and ``tokenize`` is true), the pipeline runs
            with ``tokenize_no_ssplit=True``.
        cuda: Forwarded as ``use_gpu``.
        verbose: Forwarded as ``verbose``.

    Returns:
        The CoNLL-U serialisation of the parsed document.
    """
    logging.info("generating SUD parse for the input document")

    # Options shared by all three tokenisation modes.
    pipeline_options = {
        "lang": lg,
        "dir": str(stanza_model_path),
        "use_gpu": cuda,
        "verbose": verbose,
    }
    if not tokenize:
        pipeline_options["tokenize_pretokenized"] = True
    elif not ssplit:
        pipeline_options["tokenize_no_ssplit"] = True

    parser = stanza.Pipeline(**pipeline_options)
    parsed = parser(doc)
    return CoNLL.conll_as_string(CoNLL.convert_dict(parsed.to_dict()))
def predict(self, eval_file_or_string):
    """Predict HEAD and DEPREL for CoNLL-U input and return CoNLL-U text.

    Args:
        eval_file_or_string: Passed to ``_read_conllu_arg`` together with
            ``self.feature_config``; presumably a path or raw CoNLL-U
            text -- confirm against that helper.

    Returns:
        The CoNLL-U string of the document after the predicted HEAD and
        DEPREL values have been written into it.
    """
    eval_file = _read_conllu_arg(eval_file_or_string, self.feature_config, predict=True)
    doc = Document(CoNLL.conll2dict(input_file=eval_file))
    batch = DataLoader(doc, self.batch_size, self.loaded_args, self.pretrain, vocab=self.vocab, evaluation=True, sort_during_eval=True)
    preds = []
    if len(batch) > 0:
        for i, b in enumerate(batch):
            preds += self.trainer.predict(b)
    # NOTE(review): the original layout was lost; this assumes the unsort
    # and everything after it run even for an empty batch -- confirm.
    # The loader was built with sort_during_eval=True, so predictions are
    # put back into the original order before being stored.
    preds = utils.unsort(preds, batch.data_orig_idx)
    # Flatten the per-sentence predictions into one value list for doc.set.
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
    conll_string = CoNLL.conll_as_string(doc_conll)
    return conll_string
def serve(doc, port=5000, RtoL=False):
    """Show ``doc`` in the CoNLL-U editor, via a local server or an IFrame.

    The document is first rendered as CoNLL-U text: spaCy documents are
    serialised token by token here, stanza, classla/stanfordnlp, nltk and
    combo objects through their own converters, lists by joining their
    items, and anything else with ``str(doc)``.

    Args:
        doc: Parser output or CoNLL-U text.
        port: TCP port for the local HTTP server.  ``None`` displays an
            IPython ``IFrame`` instead of starting a server.
        RtoL: Use the right-to-left editor/handler; for spaCy input also
            tags tokens longer than one character that contain a code
            point above 12287 with ``Direction=RtoL``.

    Returns:
        None.  With a port the call blocks until the server stops.
    """
    kind = str(type(doc))
    if kind.find("spacy") == 8:
        rows = []
        for t in doc:
            try:
                m = str(t.morph)
                if m.startswith("<spacy"):
                    m = ""
            except Exception:
                # The token has no usable morphology attribute.
                m = ""
            head = 0 if t.head == t else t.head.i + 1
            cols = [str(t.i + 1)]
            for field in (t.orth_, t.lemma_, t.pos_, t.tag_, m, str(head),
                          t.dep_, ""):
                cols.append("_" if field.strip() == "" else field)
            misc = []
            if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                misc.append("NE=" + t.ent_iob_ + "-" + t.ent_type_)
            if RtoL and len(t.orth_) > 1:
                if any(ord(ch) > 12287 for ch in t.orth_):
                    misc.append("Direction=RtoL")
            if not t.whitespace_:
                misc.append("SpaceAfter=No")
            if t.norm_ != "" and t.norm_ != t.orth_:
                misc.append("Translit=" + t.norm_)
            cols.append("|".join(misc) if misc else "_")
            rows.append("\t".join(cols) + "\n")
        c = "".join(rows)
    elif kind.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        c = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif kind.find("classla") == 8 or kind.find("stanfordnlp") == 8:
        c = doc.conll_file.conll_as_string()
    elif kind.find("nltk") == 8:
        c = doc.to_conll(10)
    elif kind.find("combo") == 8:
        from combo.data import sentence2conllu
        c = sentence2conllu(doc, False).serialize()
    elif kind.find("list") == 8:
        c = "".join("".join(str(t) + "\n" for t in sent) + "\n" for sent in doc)
    else:
        c = str(doc)

    if port is None:
        from IPython.display import IFrame, display
        from urllib.parse import quote
        editor = EDITOR_RTOL if RtoL else EDITOR_URL
        # The CoNLL-U text travels in the URL fragment.
        display(IFrame(src=editor + "#" + quote(c), width="100%", height="400"))
        return

    import sys
    from http.server import HTTPServer
    # Overwrite the shared temporary file with the new CoNLL-U text.
    f = TEMPFILE
    f.seek(0)
    f.truncate(0)
    f.write(c.encode("utf-8"))
    handler = DeplacyRequestHandlerRtoL if RtoL else DeplacyRequestHandler
    httpd = HTTPServer(("", port), handler)
    print("http://127.0.0.1:" + str(port) + "   " + VERSION, file=sys.stderr)
    try:
        httpd.serve_forever()
    except (KeyboardInterrupt, Exception):
        # Ctrl-C is the normal way to stop the server; any server error
        # likewise just ends the call, as before.
        return
    finally:
        # Release the listening socket so the port can be reused at once.
        httpd.server_close()
def generate_templates(fname,
                       stanza_lang,
                       rtl=False,
                       min_support=2,
                       strict=True,
                       case_folding=False,
                       remove_punct=False,
                       temp_fname="gen_templates.txt",
                       sent_fname="gen_sentences.txt",
                       remove_diacritics=True,
                       dot_fix=False,
                       join_char=' ',
                       idf_file=None):
    """Mine question/answer templates from question-answer-sentence triples.

    Every non-blank line of ``fname`` must contain
    ``question #|@ answer #|@ sentence``.  Sentence and question are parsed
    with ``stanza_lang``, written to ``sentence.conll`` / ``question.conll``
    in the working directory and re-read with udon2; templates are then
    derived from the two trees and grouped by question template.

    Args:
        fname: Path of the input file.
        stanza_lang: Callable stanza pipeline.
        rtl: Treat the question word as the last element of a question
            template instead of the first.
        min_support: Minimum number of examples a template needs unless it
            consists of template elements only.
        strict: Require ``get_difference(question, sentence)`` to be empty;
            otherwise a non-empty ``get_intersection`` is enough.
        case_folding: Lowercase all three texts first.
        remove_punct: Strip punctuation from all three texts first.
        temp_fname: Output file for the ``question => answer`` templates.
        sent_fname: Output file for the supporting examples.
        remove_diacritics: Strip diacritics from all three texts first.
        dot_fix: Append ``?`` / ``.`` to a question / sentence that does not
            end in punctuation.
        join_char: Separator used when rendering templates as strings.
        idf_file: Optional file for ``load_idf``.  Without it no IDF
            filtering is applied.

    Returns:
        ``(final_templates, temp_fname)``.
    """
    def record(key):
        problems[key] += 1

    def print_report(templates):
        print("{} templates".format(len(templates)))
        print("{} impossible questions".format(problems['impossible']))
        print(
            "{} possible questions of which {} share the root with the original sentence"
            .format(problems['possible'], problems['same_root']))
        print("{} copula questions".format(problems['copula']))
        print("{} have no question templates".format(
            problems['no_q_template']))
        print("{} have no answer templates".format(problems['no_a_template']))

    def parse_to_file(text, path):
        # Parse `text`, dump it as CoNLL-U to `path`, return its word forms.
        with open(path, 'w') as out:
            parsed = stanza_lang(text)
            conll_list = CoNLL.convert_dict(parsed.to_dict())
            tokens = [w.text for s in parsed.sentences for w in s.words]
            out.write(CoNLL.conll_as_string(conll_list))
        return tokens

    problems = {
        'possible': 0,
        'impossible': 0,
        'same_root': 0,
        'copula': 0,
        'no_q_template': 0,
        'no_a_template': 0
    }
    templates = {}

    with open(fname) as f:
        for line in tqdm.tqdm(f):
            if not line.strip():
                continue
            question, answer, base_sentence = line.split(" #|@ ")

            if remove_diacritics:
                question = remove_unicode_diacritics(question)
                base_sentence = remove_unicode_diacritics(base_sentence)
                answer = remove_unicode_diacritics(answer)

            if case_folding:
                # lowercasing is a necessary step to mitigate parser's errors
                question, answer, base_sentence = question.lower(
                ), answer.lower(), base_sentence.lower()

            if remove_punct:
                question = remove_unicode_punctuation(question)
                base_sentence = remove_unicode_punctuation(base_sentence)
                answer = remove_unicode_punctuation(answer)

            question, base_sentence, answer = question.strip(
            ), base_sentence.strip(), answer.strip()

            if dot_fix:
                if not is_punctuation(question[-1]):
                    question += "?"
                if not is_punctuation(base_sentence[-1]):
                    base_sentence += "."

            # have to proceed through files, because C++ package works with files
            sentence_tokenized = parse_to_file(base_sentence, 'sentence.conll')
            question_tokenized = parse_to_file(question, 'question.conll')

            ud_s = udon2.ConllReader.read_file('sentence.conll')[0]
            ud_q = udon2.ConllReader.read_file('question.conll')[0]

            s_root_word = ud_s.children[0]
            q_root_word = ud_q.children[0]

            if strict:
                diff = get_difference(question_tokenized, sentence_tokenized)
                cond = not diff
            else:
                same = get_intersection(question_tokenized, sentence_tokenized)
                cond = len(same) > 0

            if cond:
                # means there's a direct dependency tree transformation!
                record('possible')

                if s_root_word.form.lower() == q_root_word.form.lower():
                    # many questions that can be asked share the root with a sentence
                    record('same_root')
                elif q_root_word.prop_exists("deprel", "cop"):
                    # means this is a copula question
                    record('copula')

                q_temp = generate_question_template(s_root_word,
                                                    q_root_word,
                                                    strict=strict,
                                                    join_char=join_char)
                # Check for a missing template before slicing it, so that a
                # None result is reported instead of raising.
                if not q_temp:
                    record('no_q_template')
                    continue

                # Everything except the presumed question-word slot.
                to_check = q_temp[:-1] if rtl else q_temp[1:]
                S_t = sum([type(x) == TemplateElement for x in to_check])
                S_nt = len(to_check) - S_t

                if S_t == 0:
                    continue

                qw = q_temp[-1] if rtl else q_temp[0]
                if type(qw) == TemplateElement:
                    # the first word is not a constant, so no question word there
                    continue

                # Replace the concrete question word by a placeholder.
                if rtl:
                    qw = q_temp.pop()
                    q_temp.append('<qw>')
                else:
                    qw = q_temp.pop(0)
                    q_temp.insert(0, '<qw>')

                a_temp = generate_answer_template(s_root_word,
                                                  answer,
                                                  join_char=join_char)
                if not a_temp:
                    record('no_a_template')
                    continue

                q_temp, a_temp = normalize_templates(q_temp, a_temp)

                qtemp_without_qw = join_char.join(map(str, q_temp))
                if qtemp_without_qw not in templates:
                    templates[qtemp_without_qw] = {
                        'question': q_temp,
                        'all_templates': S_nt == 0,
                        'answer': a_temp,
                        'qw': {}
                    }
                assert templates[qtemp_without_qw]['all_templates'] == (
                    S_nt == 0), "Inconsistency in templates found"

                if qw not in templates[qtemp_without_qw]['qw']:
                    templates[qtemp_without_qw]['qw'][qw] = {}

                atemp_str = join_char.join(map(str, a_temp))
                if atemp_str not in templates[qtemp_without_qw]['qw'][qw]:
                    templates[qtemp_without_qw]['qw'][qw][atemp_str] = {
                        'answer': a_temp,
                        'examples': []
                    }

                templates[qtemp_without_qw]['qw'][qw][atemp_str][
                    'examples'].append({
                        'sentence': base_sentence.strip(),
                        'question': question.strip(),
                        'answer': answer.strip(),
                        # If not copying, then we'll have a memory error,
                        # since the associated TreeList will be freed
                        'node': s_root_word.copy(),
                    })
            elif strict:
                record('impossible')

    idf = load_idf(idf_file) if idf_file else None

    final_templates, temp_id = [], 1
    temp_base = os.path.splitext(os.path.basename(temp_fname))[0]
    with open(temp_fname, "w") as f, open(sent_fname, 'w') as f1:
        for passport in templates.values():
            N_ex = sum([
                len(data['examples'])
                for endings in passport['qw'].values()
                for data in endings.values()
            ])
            q_tmpl = join_char.join(map(str, passport['question']))

            if passport['all_templates'] or N_ex >= min_support:
                if idf is None:
                    # No IDF table supplied: nothing to filter on.
                    idfs = []
                else:
                    idfs = [
                        idf.get(t, float('inf')) for t in passport['question']
                        if type(t) == str and t != '<qw>'
                    ]
                max_idf = max(idfs) if idfs else 0
                if max_idf <= math.log(
                        4):  # appeared in at least 25% of the documents
                    for qw, endings in passport['qw'].items():
                        for a_tmpl, data in endings.items():
                            logging.debug(
                                "-- {} - {} - {} -> PASSED --".format(
                                    q_tmpl, passport['all_templates'], N_ex))
                            final_templates.append({
                                'question': q_tmpl.replace("<qw>", qw),
                                'answer': a_tmpl,
                                'props': [{
                                    'pos': x['node'].upos,
                                    'has_aux': x['node'].child_has_prop(
                                        'deprel', 'aux'),
                                    'feats': x['node'].feats
                                } for x in data['examples']]
                            })
                            sent = "\n".join([
                                " | ".join([
                                    x['sentence'], x['question'], x['answer']
                                ]) for x in data['examples']
                            ])
                            tmpl = "{} => {}".format(
                                q_tmpl.replace('<qw>', qw), a_tmpl)
                            f.write("{}\n".format(tmpl))
                            f1.write("id: {}{}\n{}\n\n".format(
                                temp_base, temp_id, sent))
                            temp_id += 1
                else:
                    logging.debug(
                        "-- {} - {} - {} -> FAILED IDF ({}) --".format(
                            q_tmpl, passport['all_templates'], N_ex, max_idf))
            else:
                logging.debug("-- {} - {} - {} -> FAILED --".format(
                    q_tmpl, passport['all_templates'], N_ex))

    print_report(final_templates)
    return final_templates, temp_fname
if args.max_examples > 0: # sample one gold question and answer q_dict_keys = list(q_dict.keys()) ind = np.random.choice(range(len(q_dict_keys))) gold_q = q_dict_keys[ind] ind_a = np.random.choice(range(len(q_dict[gold_q]))) gold_a = q_dict[gold_q][ind_a] total += len(q_dict) sent = re.sub(r' {2,}', '', sent) stanza_sent = stanza_dep_pipe(sent) with open(fname, 'w') as f: conll_list = CoNLL.convert_dict(stanza_sent.to_dict()) f.write(CoNLL.conll_as_string(conll_list)) trees = udon2.ConllReader.read_file(fname) res = overgenerate_questions(trees, guards_root, templates, template_examples, return_first=False) if res: idx_sorted_by_scores, qwf, atf, scores = rank( res, stanza_pipe, stanza_dep_pipe, qw_stat, a_tmpl,
def write_doc_to_file(doc, out_file):
    """Write ``doc`` to ``out_file`` as CoNLL-U text.

    Args:
        doc: Object whose ``to_dict()`` result ``CoNLL.convert_dict`` accepts.
        out_file: Destination path; converted with ``str()`` before opening.
    """
    conll_string = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    # CoNLL-U files are UTF-8; do not depend on the platform default.
    with open(str(out_file), "w", encoding="utf-8") as fp:
        fp.write(conll_string)
def test_conllu(processed_doc):
    """The CoNLL-U rendering of the processed document matches the gold text."""
    as_rows = CoNLL.convert_dict(processed_doc.to_dict())
    rendered = CoNLL.conll_as_string(as_rows)
    assert rendered == EN_DOC_CONLLU_GOLD
from stanza.utils.conll import CoNLL

# stanza spanish tagging
nlp = stanza.Pipeline(lang='es')

# The input to this is arbitrary, it could be a file if you wanted.
doc = nlp("Yo soy Diego. Soy de Puerto Rico.")

# Convert to conll format
stanza_conll = CoNLL.convert_dict(doc.to_dict())

# Write to conll format file - we could write multiple files for multiple
# different input sources here.  The file is only written, never read back,
# so plain 'w' is enough; CoNLL-U is UTF-8, which matters for Spanish text.
with open('conll.txt', 'w', encoding='utf-8') as f:
    f.write(CoNLL.conll_as_string(stanza_conll))

# The columns we want (maybe we can get more info, I'm not sure).
# One entry per CoNLL-U column, in column order.
COLUMN_TYPES = (
    'ignore',
    'words',
    'ignore',
    'pos',
    'ignore',
    'ignore',
    'ignore',
    'ignore',
    'ignore',
    'ignore',
)
def to_conllu(doc, RtoL=False):
    """Render the output of a dependency parser as CoNLL-U text.

    Supported inputs are recognised by the top-level package of their type:
    spaCy documents are serialised here sentence by sentence; stanza,
    classla, stanfordnlp, nltk, combo and supar objects go through their
    own converters; a list is treated as sentences of line-like items; a
    dict with a ``"sentences"`` key is treated as trankit output.  Anything
    else is returned as ``str(doc)``.

    Args:
        doc: Parser output or CoNLL-U text.
        RtoL: For spaCy input, tag tokens longer than one character that
            contain a code point above 12287 with ``Direction=RtoL``.

    Returns:
        The CoNLL-U string.
    """
    # str(type(x)) looks like "<class 'package.module.Name'>", so the
    # top-level package name always starts at offset 8.
    kind = str(type(doc))

    if kind.find("spacy") == 8:
        out = []
        for sent in doc.sents:
            for t in sent:
                try:
                    m = str(t.morph)
                    if m.startswith("<spacy"):
                        m = ""
                except Exception:
                    # The token has no usable morphology attribute.
                    m = ""
                # IDs and heads are relative to the start of the sentence.
                head = 0 if t.head == t else t.head.i - sent.start + 1
                cols = [str(t.i - sent.start + 1)]
                for field in (t.orth_, t.lemma_, t.pos_, t.tag_, m,
                              str(head), t.dep_, ""):
                    cols.append("_" if field.strip() == "" else field)
                misc = []
                if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                    misc.append("NE=" + t.ent_iob_ + "-" + t.ent_type_)
                if RtoL and len(t.orth_) > 1:
                    if any(ord(ch) > 12287 for ch in t.orth_):
                        # Direction goes first in the MISC column.
                        misc.insert(0, "Direction=RtoL")
                if not t.whitespace_:
                    misc.append("SpaceAfter=No")
                if t.norm_ != "" and t.norm_ != t.orth_:
                    misc.append("Translit=" + t.norm_)
                cols.append("|".join(misc) if misc else "_")
                out.append("\t".join(cols) + "\n")
            out.append("\n")
        return "".join(out)

    if kind.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        return CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    if kind.find("classla") == 8:
        return doc.to_conll()
    if kind.find("stanfordnlp") == 8:
        return doc.conll_file.conll_as_string()
    if kind.find("nltk") == 8:
        return doc.to_conll(10)
    if kind.find("combo") == 8:
        from combo.data import sentence2conllu
        return sentence2conllu(doc, False).serialize()
    if kind.find("supar") == 8:
        if hasattr(doc, "sentences"):
            return "".join([str(sent) + "\n" for sent in doc.sentences])
        return str(doc) + "\n"
    if kind.find("list") == 8:
        return "".join(
            "".join(str(t) + "\n" for t in sent) + "\n" for sent in doc)
    if kind.find("dict") == 8 and "sentences" in doc:
        from trankit.utils.conll import CoNLL
        sentences = []
        for sent in doc["sentences"]:
            tokens = []
            for t in sent["tokens"]:
                if "span" in t:
                    begin, end = t["span"]
                    # Work on a copy so the caller's document is not changed.
                    t = dict(t, misc="start_char=" + str(begin) +
                             "|end_char=" + str(end))
                tokens.append(t)
                if "expanded" in t:
                    tokens.extend(t["expanded"])
            sentences.append(tokens)
        return CoNLL.conll_as_string(CoNLL.convert_dict(sentences))
    return str(doc)