import logging
import math
import os

import tqdm
import udon2
from stanza.utils.conll import CoNLL

# The helpers used below (remove_unicode_diacritics, remove_unicode_punctuation,
# is_punctuation, get_difference, get_intersection, generate_question_template,
# generate_answer_template, normalize_templates, load_idf, TemplateElement)
# are assumed to be defined elsewhere in the same module.


def generate_templates(fname, stanza_lang, rtl=False, min_support=2, strict=True,
                       case_folding=False, remove_punct=False,
                       temp_fname="gen_templates.txt", sent_fname="gen_sentences.txt",
                       remove_diacritics=True, dot_fix=False, join_char=' ',
                       idf_file=None):
    def record(key):
        problems[key] += 1

    def print_report(templates):
        print("{} templates".format(len(templates)))
        print("{} impossible questions".format(problems['impossible']))
        print("{} possible questions of which {} share the root with the original sentence".format(
            problems['possible'], problems['same_root']))
        print("{} copula questions".format(problems['copula']))
        print("{} have no question templates".format(problems['no_q_template']))
        print("{} have no answer templates".format(problems['no_a_template']))

    problems = {
        'possible': 0,
        'impossible': 0,
        'same_root': 0,
        'copula': 0,
        'no_q_template': 0,
        'no_a_template': 0
    }

    templates = {}
    with open(fname) as f:
        for line in tqdm.tqdm(f):
            if not line.strip():
                continue

            question, answer, base_sentence = line.split(" #|@ ")

            if remove_diacritics:
                question = remove_unicode_diacritics(question)
                base_sentence = remove_unicode_diacritics(base_sentence)
                answer = remove_unicode_diacritics(answer)

            if case_folding:
                # lowercasing is a necessary step to mitigate the parser's errors
                question, answer, base_sentence = (
                    question.lower(), answer.lower(), base_sentence.lower())

            if remove_punct:
                question = remove_unicode_punctuation(question)
                base_sentence = remove_unicode_punctuation(base_sentence)
                answer = remove_unicode_punctuation(answer)

            question, base_sentence, answer = (
                question.strip(), base_sentence.strip(), answer.strip())

            if dot_fix:
                if not is_punctuation(question[-1]):
                    question += "?"
                if not is_punctuation(base_sentence[-1]):
                    base_sentence += "."

            # have to go through files, because the C++ package works with files
            with open('sentence.conll', 'w') as f1:
                ss = stanza_lang(base_sentence)
                conll_list = CoNLL.convert_dict(ss.to_dict())
                sentence_tokenized = [w.text for s in ss.sentences for w in s.words]
                f1.write(CoNLL.conll_as_string(conll_list))

            with open('question.conll', 'w') as f1:
                qq = stanza_lang(question)
                conll_list = CoNLL.convert_dict(qq.to_dict())
                question_tokenized = [w.text for s in qq.sentences for w in s.words]
                f1.write(CoNLL.conll_as_string(conll_list))

            ud_s = udon2.ConllReader.read_file('sentence.conll')[0]
            ud_q = udon2.ConllReader.read_file('question.conll')[0]

            s_root_word = ud_s.children[0]
            q_root_word = ud_q.children[0]

            if strict:
                diff = get_difference(question_tokenized, sentence_tokenized)
                cond = not diff
            else:
                same = get_intersection(question_tokenized, sentence_tokenized)
                cond = len(same) > 0

            if cond:
                # means there's a direct dependency tree transformation!
                record('possible')

                if s_root_word.form.lower() == q_root_word.form.lower():
                    # many questions that can be asked share the root with a sentence
                    record('same_root')
                elif q_root_word.prop_exists("deprel", "cop"):
                    # means this is a copula question
                    record('copula')

                q_temp = generate_question_template(
                    s_root_word, q_root_word, strict=strict, join_char=join_char)

                to_check = q_temp[:-1] if rtl else q_temp[1:]
                S_t = sum([type(x) == TemplateElement for x in to_check])
                S_nt = len(to_check) - S_t

                if not q_temp:
                    record('no_q_template')
                    continue

                if S_t == 0:
                    continue

                qw = q_temp[-1] if rtl else q_temp[0]
                if type(qw) == TemplateElement:
                    # the first word is not a constant, so no question word there
                    continue

                if rtl:
                    qw = q_temp.pop()
                    q_temp.append('<qw>')
                else:
                    qw = q_temp.pop(0)
                    q_temp.insert(0, '<qw>')

                a_temp = generate_answer_template(s_root_word, answer, join_char=join_char)
                if not a_temp:
                    record('no_a_template')
                    continue

                q_temp, a_temp = normalize_templates(q_temp, a_temp)

                qtemp_without_qw = join_char.join(map(str, q_temp))
                if qtemp_without_qw not in templates:
                    templates[qtemp_without_qw] = {
                        'question': q_temp,
                        'all_templates': S_nt == 0,
                        'answer': a_temp,
                        'qw': {}
                    }

                assert templates[qtemp_without_qw]['all_templates'] == (S_nt == 0), \
                    "Inconsistency in templates found"

                if qw not in templates[qtemp_without_qw]['qw']:
                    templates[qtemp_without_qw]['qw'][qw] = {}

                atemp_str = join_char.join(map(str, a_temp))
                if atemp_str not in templates[qtemp_without_qw]['qw'][qw]:
                    templates[qtemp_without_qw]['qw'][qw][atemp_str] = {
                        'answer': a_temp,
                        'examples': []
                    }

                templates[qtemp_without_qw]['qw'][qw][atemp_str]['examples'].append({
                    'sentence': base_sentence.strip(),
                    'question': question.strip(),
                    'answer': answer.strip(),
                    # copy the node; otherwise the associated TreeList is freed
                    # and we get a memory error
                    'node': s_root_word.copy(),
                })
            elif strict:
                record('impossible')

    idf = load_idf(idf_file) if idf_file else None

    final_templates, temp_id = [], 1
    temp_base = os.path.splitext(os.path.basename(temp_fname))[0]
    with open(temp_fname, "w") as f, open(sent_fname, 'w') as f1:
        for _, passport in templates.items():
            N_ex = sum([
                len(data['examples'])
                for _, endings in passport['qw'].items()
                for _, data in endings.items()
            ])
            q_tmpl = join_char.join(map(str, passport['question']))

            if passport['all_templates'] or N_ex >= min_support:
                # guard added for the case when no idf_file was given
                idfs = [
                    idf.get(t, float('inf'))
                    for t in passport['question']
                    if type(t) == str and t != '<qw>'
                ] if idf else []
                max_idf = max(idfs) if idfs else 0

                if max_idf <= math.log(4):  # appeared in at least 25% of the documents
                    for qw, endings in passport['qw'].items():
                        for a_tmpl, data in endings.items():
                            logging.debug("-- {} - {} - {} -> PASSED --".format(
                                q_tmpl, passport['all_templates'], N_ex))
                            final_templates.append({
                                'question': q_tmpl.replace("<qw>", qw),
                                'answer': a_tmpl,
                                'props': [{
                                    'pos': x['node'].upos,
                                    'has_aux': x['node'].child_has_prop('deprel', 'aux'),
                                    'feats': x['node'].feats
                                } for x in data['examples']]
                            })
                            sent = "\n".join([
                                " | ".join([x['sentence'], x['question'], x['answer']])
                                for x in data['examples']
                            ])
                            tmpl = "{} => {}".format(q_tmpl.replace('<qw>', qw), a_tmpl)
                            f.write("{}\n".format(tmpl))
                            f1.write("id: {}{}\n{}\n\n".format(temp_base, temp_id, sent))
                            temp_id += 1
                else:
                    logging.debug("-- {} - {} - {} -> FAILED IDF ({}) --".format(
                        q_tmpl, passport['all_templates'], N_ex, max_idf))
            else:
                logging.debug("-- {} - {} - {} -> FAILED --".format(
                    q_tmpl, passport['all_templates'], N_ex))

    print_report(final_templates)
    return final_templates, temp_fname
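# A minimal usage sketch (not part of the original project): it assumes the helper
# functions referenced above are available from the same module, and it uses a
# hypothetical input file name. The file format follows the line.split(" #|@ ")
# call above: one "question #|@ answer #|@ base sentence" triple per line.
import stanza

with open("qa_triples.txt", "w") as f:  # hypothetical example file
    f.write("Who wrote Hamlet? #|@ Shakespeare #|@ Shakespeare wrote Hamlet.\n")

# A dependency parser is required because the templates are built from dependency trees.
nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")

templates, temp_fname = generate_templates(
    "qa_triples.txt", nlp,
    min_support=1,   # keep templates supported by a single example in this tiny demo
    idf_file=None,   # skip the IDF-based filtering
)
print(len(templates), "templates written to", temp_fname)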
def serve(doc, port=5000, RtoL=False):
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for t in doc:
            try:
                m = str(t.morph)
                if m.startswith("<spacy"):
                    m = ""
            except:
                m = ""
            c += str(t.i + 1)
            for i in [
                    t.orth_, t.lemma_, t.pos_, t.tag_, m,
                    str(0 if t.head == t else t.head.i + 1), t.dep_, ""
            ]:
                c += "\t_" if i.strip() == "" else "\t" + i
            if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
            else:
                u = ""
            if RtoL and len(t.orth_) > 1:
                if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                    u += ("" if u == "" else "|") + "Direction=RtoL"
            if not t.whitespace_:
                u += ("" if u == "" else "|") + "SpaceAfter=No"
            if t.norm_ != "" and t.norm_ != t.orth_:
                u += ("" if u == "" else "|") + "Translit=" + t.norm_
            if u == "":
                u = "_"
            c += "\t" + u + "\n"
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        c = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        c = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        c = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        c = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        c = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        c = str(doc)
    if port == None:
        from IPython.display import IFrame, display
        from urllib.parse import quote
        if RtoL:
            display(IFrame(src=EDITOR_RTOL + "#" + quote(c), width="100%", height="400"))
        else:
            display(IFrame(src=EDITOR_URL + "#" + quote(c), width="100%", height="400"))
        return
    import sys
    from http.server import HTTPServer
    f = TEMPFILE
    f.seek(0)
    f.truncate(0)
    f.write(c.encode("utf-8"))
    if RtoL:
        httpd = HTTPServer(("", port), DeplacyRequestHandlerRtoL)
    else:
        httpd = HTTPServer(("", port), DeplacyRequestHandler)
    print("http://127.0.0.1:" + str(port) + " " + VERSION, file=sys.stderr)
    try:
        httpd.serve_forever()
    except:
        return
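# Hedged usage sketch: this serve() appears to be deplacy's, so with deplacy
# installed the same call is available as deplacy.serve(...). As the code above
# shows, passing port=None embeds the viewer as an IFrame in a notebook instead
# of starting a local HTTP server.
import stanza
import deplacy

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
doc = nlp("The quick brown fox jumps over the lazy dog.")
deplacy.serve(doc, port=None)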
if args.max_examples > 0:
    # sample one gold question and answer
    q_dict_keys = list(q_dict.keys())
    ind = np.random.choice(range(len(q_dict_keys)))
    gold_q = q_dict_keys[ind]
    ind_a = np.random.choice(range(len(q_dict[gold_q])))
    gold_a = q_dict[gold_q][ind_a]

total += len(q_dict)

sent = re.sub(r' {2,}', '', sent)
stanza_sent = stanza_dep_pipe(sent)
with open(fname, 'w') as f:
    conll_list = CoNLL.convert_dict(stanza_sent.to_dict())
    f.write(CoNLL.conll_as_string(conll_list))

trees = udon2.ConllReader.read_file(fname)
res = overgenerate_questions(trees, guards_root, templates,
                             template_examples, return_first=False)
if res:
    idx_sorted_by_scores, qwf, atf, scores = rank(
        res, stanza_pipe, stanza_dep_pipe, qw_stat,
                arquivos[arquivo.rsplit(".txt")[0]] = nlp(text)
            except:
                sys.stderr.write('\nerro: ' + arquivo)
        with open(
                diretorio + "/" +
                f'{diretorio.rsplit("/", 1)[1] if "/" in diretorio else diretorio}' +
                ".p", "wb") as f:
            pickle.dump(arquivos, f)
    else:
        with open(
                diretorio + "/" +
                f'{diretorio.rsplit("/", 1)[1] if "/" in diretorio else diretorio}' +
                ".p", "rb") as f:
            arquivos = pickle.load(f)
    for arquivo in arquivos:
        arquivos[arquivo] = CoNLL.convert_dict(arquivos[arquivo].to_dict())
elif os.path.isfile(diretorio):
    arquivo = diretorio
    if not os.path.isfile(diretorio + ".p"):
        with open(arquivo, encoding="utf-8") as f:
            text = f.read()
        arquivos[arquivo.rsplit(".txt")[0]] = nlp(text)
        with open(diretorio + ".p", "wb") as w:
            pickle.dump(arquivos, w)
    else:
        with open(diretorio + ".p", "rb") as f:
            arquivos = pickle.load(f)
    for arquivo in arquivos:
        arquivos[arquivo] = CoNLL.convert_dict(arquivos[arquivo].to_dict())

sentences = []
def test_conllu(processed_doc):
    assert CoNLL.conll_as_string(
        CoNLL.convert_dict(processed_doc.to_dict())) == EN_DOC_CONLLU_GOLD
def write_doc_to_file(doc, out_file):
    conll_string = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    with open(str(out_file), "w") as fp:
        fp.write(conll_string)
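# Short usage sketch (the input text and output path are placeholders): parse
# with stanza, then dump the ten-column CoNLL-U serialization to disk.
import stanza

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
doc = nlp("Barack Obama was born in Hawaii.")
write_doc_to_file(doc, "obama.conllu")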
# Converts a text processed by stanza into an NLTK corpus
import nltk
from nltk.corpus.reader import conll

import stanza
from stanza.utils.conll import CoNLL

# stanza Spanish tagging
nlp = stanza.Pipeline(lang='es')

# The input to this is arbitrary, it could be a file if you wanted.
doc = nlp("Yo soy Diego. Soy de Puerto Rico.")

# Convert to conll format
stanza_conll = CoNLL.convert_dict(doc.to_dict())

# Write to conll format file - we could write multiple files for multiple
# different input sources here
with open('conll.txt', 'w+') as f:
    f.write(CoNLL.conll_as_string(stanza_conll))

# The columns we want (maybe we can get more info, I'm not sure)
COLUMN_TYPES = (
    'ignore',
    'words',
    'ignore',
    'pos',
    'ignore',
    'ignore',
    'ignore',
    with open(sys.argv[2] + ".json") as f:
        tokenized_dict = json.load(f)
else:
    tokenized = [[
        token.split("\t")[0]
        for token in sentence.splitlines()
        if len(token.split("\t")) > 7 and '-=' not in token.split("\t")[0]
    ] for sentence in arquivo]
    print("1/4 dicionário tokenizado: ok")

    nlp = stanza.Pipeline('pt', tokenize_pretokenized=True)
    tokenized_nlp = nlp([x for x in tokenized if x])
    print("2/4 anotação: ok")

    tokenized_dict = tokenized_nlp.to_dict()
    with open(sys.argv[2] + ".json", "w") as f:
        json.dump(tokenized_dict, f)
    print(":: checkpoint :: conversão para dict: salva em json")

tokenized = CoNLL.convert_dict(tokenized_dict)
print("3/4 conversão para CoNLL: ok")

sentences = []
for s, sentence in enumerate([x for x in tokenized if x]):
    metadados = {}
    #text = arquivo[s].split("# text = ")[1].split("\n")[0]
    #sent_id = arquivo[s].split("# sent_id = ")[1].split("\n")[0]
    for token in arquivo[s].splitlines():
        #print(token)
        if token.startswith("# "):
            metadados[token.split("# ", 1)[1].split(" ")[0]] = token.split(" = ", 1)[1]
        if '-=' in token:
            for t, _token in enumerate(sentence):
def test_dict_to_conll():
    conll = CoNLL.convert_dict(DICT)
    assert conll == CONLL
def to_conllu(doc, RtoL=False):
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for s in doc.sents:
            for t in s:
                try:
                    m = str(t.morph)
                    if m.startswith("<spacy"):
                        m = ""
                except:
                    m = ""
                c += str(t.i - s.start + 1)
                for i in [
                        t.orth_, t.lemma_, t.pos_, t.tag_, m,
                        str(0 if t.head == t else t.head.i - s.start + 1), t.dep_, ""
                ]:
                    c += "\t_" if i.strip() == "" else "\t" + i
                if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                    u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
                else:
                    u = ""
                if RtoL and len(t.orth_) > 1:
                    if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                        u = "Direction=RtoL" if u == "" else "Direction=RtoL|" + u
                if not t.whitespace_:
                    u += ("" if u == "" else "|") + "SpaceAfter=No"
                if t.norm_ != "" and t.norm_ != t.orth_:
                    u += ("" if u == "" else "|") + "Translit=" + t.norm_
                if u == "":
                    u = "_"
                c += "\t" + u + "\n"
            c += "\n"
        return c
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        return CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8:
        return doc.to_conll()
    elif s.find("stanfordnlp") == 8:
        return doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        return doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        return sentence2conllu(doc, False).serialize()
    elif s.find("supar") == 8:
        if hasattr(doc, "sentences"):
            return "".join([str(s) + "\n" for s in doc.sentences])
        else:
            return str(doc) + "\n"
    elif s.find("list") == 8:
        return "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    elif s.find("dict") == 8 and "sentences" in doc:
        from trankit.utils.conll import CoNLL
        d = []
        for s in doc["sentences"]:
            e = []
            for t in s["tokens"]:
                if "span" in t:
                    i, j = t["span"]
                    t["misc"] = "start_char=" + str(i) + "|end_char=" + str(j)
                e.append(t)
                if "expanded" in t:
                    e.extend(t["expanded"])
            d.append(list(e))
        return CoNLL.conll_as_string(CoNLL.convert_dict(d))
    return str(doc)
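# As with serve() above, a hedged usage sketch: for a stanza Document this
# dispatcher simply round-trips through CoNLL.convert_dict/conll_as_string,
# so the call below prints standard CoNLL-U text.
import stanza

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
doc = nlp("She read the book.")
print(to_conllu(doc))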
import io

import nltk
import stanza
from stanza.utils.conll import CoNLL

# MosesDetokenizer, load_conllu, evaluate and print_results are assumed to be
# imported elsewhere (e.g., from sacremoses and the CoNLL shared-task
# evaluation script).
detok = MosesDetokenizer()

# English
gold_conll_en = ""
for s in nltk.corpus.dependency_treebank.parsed_sents()[:200]:
    gold_conll_en += s.to_conll(10) + '\r\n'

nlp = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse')
stanza_conll_en = ""
for s in nltk.corpus.dependency_treebank.sents()[:200]:
    sent = detok.detokenize(s)
    doc = nlp(sent)
    for s in CoNLL.convert_dict(doc.to_dict()):
        for w in s:
            for i, content in enumerate(w):
                stanza_conll_en += content + '\t'
            stanza_conll_en = stanza_conll_en[:-1] + '\r\n'
        stanza_conll_en += '\r\n'

f_gold_en = io.StringIO(gold_conll_en)
f_stanza_en = io.StringIO(stanza_conll_en)
gold_en_eval = load_conllu(f_gold_en)
stanza_en_eval = load_conllu(f_stanza_en)
stanza_en_evaluation = evaluate(gold_en_eval, stanza_en_eval)
print_results(stanza_en_evaluation,