Example #1
def makeDoc(doc):
    # Normalize a parse from spaCy, stanza, classla, stanfordnlp, NLTK or COMBO
    # (or a plain CoNLL-U string) into a list of spaCy-like token objects.
    s = str(type(doc))
    if s.find("spacy") == 8:  # str(type(x)) is "<class 'spacy...'>"; the module name starts at index 8
        return doc
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        d = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        d = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        d = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        d = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        d = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        d = str(doc)
    DOC = []
    m = []
    misc = ""
    for t in d.split("\n"):
        x = t.split("\t")
        if len(x) != 10:
            continue
        try:
            i, j = int(x[0]), int(x[6])
        except ValueError:
            try:
                # multi-word token line: the ID is a range such as "3-4"
                i = x[0].index("-")
                j = int(x[0][0:i])
                k = int(x[0][i + 1:])
                m.append((len(DOC), j, k, x[1]))
                continue
            except ValueError:
                continue
        s = type("", (object, ), {"i": i})  # a fresh class per token serves as a lightweight token object
        s.orth_ = x[1]
        s.pos_ = x[3]
        s.head = j
        s.dep_ = x[7]
        s.whitespace_ = (x[9].find("SpaceAfter=No") < 0)
        if s.whitespace_:
            # if this token's start_char matches the previous token's end_char,
            # there was no space between them either
            i = x[9].find("start_char=")
            if i >= 0:
                j = x[9].find("|", i)
                k = x[9][i + 5:] if j < 0 else x[9][i + 5:j]  # "_char=NNN"
                if misc.find("end" + k) >= 0:
                    DOC[-1].whitespace_ = False
        DOC.append(s)
        misc = x[9]
    for i, j, k, f in reversed(m):
        # attach contraction info from the buffered multi-word-token lines,
        # mapping original CoNLL-U IDs onto DOC indices
        offset = i - DOC[i].i
        DOC[k + offset].contract = (f, [i + offset for i in range(j, k + 1)])
    for i, t in enumerate(DOC):
        if t.head == 0:
            t.head = t  # a root token points at itself, as in spaCy
        else:
            t.head = DOC[i + t.head - t.i]
    return DOC
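
A minimal usage sketch for makeDoc (the pipeline, sentence, and printed fields are illustrative; assumes the stanza English model is downloaded and a stanza version that still provides CoNLL.conll_as_string):

import stanza

nlp = stanza.Pipeline("en", verbose=False)
for t in makeDoc(nlp("She left.")):
    print(t.i, t.orth_, t.pos_, t.head.i, t.dep_)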
Example #2
 def __init__(self,UniDic,UDPipe):
   # UniDic: MeCab/UniDic dictionary name, or None to let the UDPipe model
   # tokenize; UDPipe: parser model name, or None for the default "japanese-gsd"
   self.UniDic=UniDic
   if UniDic is not None:
     d=os.path.join(DOWNLOAD_DIR,UniDic)
     r=os.path.join(PACKAGE_DIR,"mecabrc")
     if os.path.isdir(d):
       try:
         from MeCab import Tagger
       except:
         from fugashi import GenericTagger as Tagger
       self.mecab=Tagger("-r "+r+" -d "+d).parse
     elif UniDic=="unidic-lite":
       try:
         from MeCab import Tagger
       except:
         from fugashi import GenericTagger as Tagger
       import unidic_lite
       self.mecab=Tagger("-r "+r+" -d "+unidic_lite.DICDIR).parse
     elif UniDic=="ipadic":
       try:
         from MeCab import Tagger
       except:
         from fugashi import GenericTagger as Tagger
       try:
         import ipadic
         self.mecab=Tagger(ipadic.MECAB_ARGS).parse
       except:
         self.mecab=Tagger().parse
     else:
       # era-specific UniDic variants, handled through the Web Chamame API
       d={ "gendai":"dic1", "spoken":"dic2", "qkana":"dic3", "kindai":"dic4", "kinsei":"dic5", "kyogen":"dic6", "wakan":"dic7", "wabun":"dic8", "manyo":"dic9" }
       self.dictkey=d[UniDic]
       self.mecab=self.ChamameWebAPI
   self.udpipe=self.UDPipeWebAPI
   if UDPipe is None:
     self.model="japanese-gsd"
   else:
     self.model=UDPipe
     m=os.path.join(DOWNLOAD_DIR,self.model+".udpipe")
     if os.path.isfile(m):
       import ufal.udpipe
       self.model=ufal.udpipe.Model.load(m)
       if UniDic is None:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"tokenizer=presegmented","","","").process
       else:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"conllu","none","","").process
     elif self.model.startswith("stanza_"):
       import stanza
       if UniDic is None:
         self.model=stanza.Pipeline(self.model[7:],verbose=False)
         from stanza.utils.conll import CoNLL
         self.udpipe=lambda text:CoNLL.conll_as_string(CoNLL.convert_dict(self.model(text).to_dict()))
       else:
         self.model=stanza.Pipeline(self.model[7:],processors="depparse",depparse_pretagged=True,verbose=False)
         self.udpipe=self.StanzaAPI
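
The repeated try/except around the MeCab import above is a portability pattern: it falls back to fugashi's GenericTagger when the official mecab-python3 bindings are missing. Distilled into a standalone sketch (assumes unidic-lite is installed; the original additionally passes an explicit -r mecabrc, which some MeCab builds require):

try:
    from MeCab import Tagger  # official mecab-python3 bindings
except ImportError:
    from fugashi import GenericTagger as Tagger  # pure-pip fallback

import unidic_lite

tagger = Tagger("-d " + unidic_lite.DICDIR)
print(tagger.parse("これはテストです。"))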
Example #3
 def StanzaAPI(self,conllu):
   # re-parse pretagged CoNLL-U with the depparse-only stanza pipeline
   d=[]
   e=[]
   for s in conllu.split("\n"):
     if s=="" or s.startswith("#"):
       if e:
         d.append(list(e))
         e=[]
     else:
       t=s.split("\t")
       e.append({"id":int(t[0]),"text":t[1],"lemma":t[2],"upos":t[3],"xpos":t[4],"misc":t[9]})
   if e:
     d.append(list(e))
   from stanza.models.common.doc import Document
   from stanza.utils.conll import CoNLL
   return CoNLL.conll_as_string(CoNLL.convert_dict(self.model(Document(d)).to_dict()))
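
The same pretagged-reparse trick as a standalone sketch (sentence and tags are illustrative; assumes the stanza English models are downloaded and a stanza version that still ships CoNLL.conll_as_string):

import stanza
from stanza.models.common.doc import Document
from stanza.utils.conll import CoNLL

nlp = stanza.Pipeline("en", processors="depparse", depparse_pretagged=True, verbose=False)
pretagged = [[{"id": 1, "text": "She", "lemma": "she", "upos": "PRON", "xpos": "PRP"},
              {"id": 2, "text": "left", "lemma": "leave", "upos": "VERB", "xpos": "VBD"}]]
doc = nlp(Document(pretagged))
print(CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict())))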
Example #4
import stanza
from stanza.utils.conll import CoNLL


def main(args):

    sents = load_conllu(args.input)

    stanza_nlp = stanza.Pipeline(
        lang=args.lg,
        dir=args.model,
        package=args.treebank,
        tokenize_no_ssplit=True,
        use_gpu=args.cuda,
    )

    # with tokenize_no_ssplit=True, blank-line-separated chunks are treated as sentences
    doc = stanza_nlp(sents)
    doc_dict = doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)
    with open(args.output, "w") as wf:
        wf.write(doc_conll_str)
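
load_conllu is not shown above. Because tokenize_no_ssplit=True makes stanza treat blank-line-separated chunks as single sentences, a minimal stand-in (an assumption, not the author's actual helper) could rebuild the raw sentences from the FORM column:

def load_conllu(path):
    sents, toks = [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if toks:
                    sents.append(" ".join(toks))
                    toks = []
            elif not line.startswith("#"):
                cols = line.split("\t")
                if cols[0].isdigit():  # skip multi-word-token ranges and empty nodes
                    toks.append(cols[1])
    if toks:
        sents.append(" ".join(toks))
    return "\n\n".join(sents)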
Example #5
import logging
from pathlib import Path

import stanza
from stanza.utils.conll import CoNLL


def get_depd_tree(
    doc: str,
    lg: str,
    stanza_model_path: Path,
    tokenize: bool = True,
    ssplit: bool = False,
    cuda: bool = False,
    verbose: bool = False,
) -> str:

    logging.info(f"generating SUD parse for the input document")

    model_dir = str(stanza_model_path)
    if tokenize and ssplit:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            use_gpu=cuda,
            verbose=verbose,
        )
    elif tokenize:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            tokenize_no_ssplit=True,
            use_gpu=cuda,
            verbose=verbose,
        )
    else:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            tokenize_pretokenized=True,
            use_gpu=cuda,
            verbose=verbose,
        )

    stanza_doc = stanza_nlp(doc)
    doc_dict = stanza_doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)

    return doc_conll_str
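
A call sketch (language code and model directory are assumptions; the stanza models must already be downloaded there):

from pathlib import Path

conllu = get_depd_tree("A short test sentence.", "en",
                       Path.home() / "stanza_resources",
                       tokenize=True, ssplit=True)
print(conllu)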
Example #6
    def predict(self, eval_file_or_string):
        # accept either a path to a CoNLL-U file or a raw CoNLL-U string
        eval_file = _read_conllu_arg(eval_file_or_string,
                                     self.feature_config,
                                     predict=True)
        doc = Document(CoNLL.conll2dict(input_file=eval_file))
        batch = DataLoader(doc,
                           self.batch_size,
                           self.loaded_args,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=True)

        preds = []
        if len(batch) > 0:
            for b in batch:
                preds += self.trainer.predict(b)
        # restore the original sentence order, then write the predicted
        # HEAD/DEPREL values back into the document
        preds = utils.unsort(preds, batch.data_orig_idx)
        batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])

        doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
        conll_string = CoNLL.conll_as_string(doc_conll)
        return conll_string
Example #7
def serve(doc, port=5000, RtoL=False):
    # Convert any supported parse object to CoNLL-U, then serve the deplacy
    # editor for it over HTTP (or embed it inline when port is None).
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for t in doc:
            try:
                m = str(t.morph)
                if m.startswith("<spacy"):  # guard against a bare object repr
                    m = ""
            except AttributeError:  # spaCy v2 tokens have no .morph
                m = ""
            c += str(t.i + 1)
            for i in [
                    t.orth_, t.lemma_, t.pos_, t.tag_, m,
                    str(0 if t.head == t else t.head.i + 1), t.dep_, ""
            ]:
                c += "\t_" if i.strip() == "" else "\t" + i
            if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
            else:
                u = ""
            if RtoL and len(t.orth_) > 1:
                if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                    u += ("" if u == "" else "|") + "Direction=RtoL"
            if not t.whitespace_:
                u += ("" if u == "" else "|") + "SpaceAfter=No"
            if t.norm_ != "" and t.norm_ != t.orth_:
                u += ("" if u == "" else "|") + "Translit=" + t.norm_
            if u == "":
                u = "_"
            c += "\t" + u + "\n"
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        c = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        c = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        c = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        c = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        c = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        c = str(doc)
    if port is None:
        from IPython.display import IFrame, display
        from urllib.parse import quote
        if RtoL:
            display(
                IFrame(src=EDITOR_RTOL + "#" + quote(c),
                       width="100%",
                       height="400"))
        else:
            display(
                IFrame(src=EDITOR_URL + "#" + quote(c),
                       width="100%",
                       height="400"))
        return
    import sys
    from http.server import HTTPServer
    f = TEMPFILE
    f.seek(0)
    f.truncate(0)
    f.write(c.encode("utf-8"))
    if RtoL:
        httpd = HTTPServer(("", port), DeplacyRequestHandlerRtoL)
    else:
        httpd = HTTPServer(("", port), DeplacyRequestHandler)
    print("http://127.0.0.1:" + str(port) + "   " + VERSION, file=sys.stderr)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        return
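
The EDITOR_URL and DeplacyRequestHandler globals suggest this is deplacy's serve; a typical call looks like the sketch below (assumes spaCy and its small English model are installed):

import spacy
import deplacy

nlp = spacy.load("en_core_web_sm")
deplacy.serve(nlp("I saw her."), port=5000)  # Ctrl-C stops the server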
Example #8
def generate_templates(fname,
                       stanza_lang,
                       rtl=False,
                       min_support=2,
                       strict=True,
                       case_folding=False,
                       remove_punct=False,
                       temp_fname="gen_templates.txt",
                       sent_fname="gen_sentences.txt",
                       remove_diacritics=True,
                       dot_fix=False,
                       join_char=' ',
                       idf_file=None):
    def record(key):
        problems[key] += 1

    def print_report(templates):
        print("{} templates".format(len(templates)))
        print("{} impossible questions".format(problems['impossible']))
        print(
            "{} possible questions of which {} share the root with the original sentence"
            .format(problems['possible'], problems['same_root']))
        print("{} copula questions".format(problems['copula']))
        print("{} have no question templates".format(
            problems['no_q_template']))
        print("{} have no answer templates".format(problems['no_a_template']))

    problems = {
        'possible': 0,
        'impossible': 0,
        'same_root': 0,
        'copula': 0,
        'no_q_template': 0,
        'no_a_template': 0
    }

    templates = {}
    with open(fname) as f:
        for line in tqdm.tqdm(f):
            if line.strip():
                question, answer, base_sentence = line.split(" #|@ ")

                if remove_diacritics:
                    question = remove_unicode_diacritics(question)
                    base_sentence = remove_unicode_diacritics(base_sentence)
                    answer = remove_unicode_diacritics(answer)

                if case_folding:
                    # lowercasing is a necessary step to mitigate parser errors
                    question = question.lower()
                    answer = answer.lower()
                    base_sentence = base_sentence.lower()

                if remove_punct:
                    question = remove_unicode_punctuation(question)
                    base_sentence = remove_unicode_punctuation(base_sentence)
                    answer = remove_unicode_punctuation(answer)

                question = question.strip()
                base_sentence = base_sentence.strip()
                answer = answer.strip()
                if dot_fix:
                    if not is_punctuation(question[-1]):
                        question += "?"
                    if not is_punctuation(base_sentence[-1]):
                        base_sentence += "."

                # we have to go through files because the C++ package (udon2)
                # reads its input from files
                with open('sentence.conll', 'w') as f1:
                    ss = stanza_lang(base_sentence)
                    conll_list = CoNLL.convert_dict(ss.to_dict())
                    sentence_tokenized = [
                        w.text for s in ss.sentences for w in s.words
                    ]
                    f1.write(CoNLL.conll_as_string(conll_list))

                with open('question.conll', 'w') as f1:
                    qq = stanza_lang(question)
                    conll_list = CoNLL.convert_dict(qq.to_dict())
                    question_tokenized = [
                        w.text for s in qq.sentences for w in s.words
                    ]
                    f1.write(CoNLL.conll_as_string(conll_list))

                ud_s = udon2.ConllReader.read_file('sentence.conll')[0]
                ud_q = udon2.ConllReader.read_file('question.conll')[0]

                # s_roots = udon2.ConllReader.read_file('sentence.conll')
                # q_roots = udon2.ConllReader.read_file('question.conll')
                # ud_s = s_roots[0]
                # ud_q = q_roots[0]

                s_root_word = ud_s.children[0]
                q_root_word = ud_q.children[0]

                if strict:
                    diff = get_difference(question_tokenized,
                                          sentence_tokenized)
                    cond = not diff
                else:
                    same = get_intersection(question_tokenized,
                                            sentence_tokenized)
                    cond = len(same) > 0

                if cond:
                    # means there's a direct dependency tree transformation!
                    record('possible')
                    if s_root_word.form.lower() == q_root_word.form.lower():
                        # many questions that can be asked share the root with a sentence
                        record('same_root')
                    elif q_root_word.prop_exists("deprel", "cop"):
                        # means this is a copula question
                        record('copula')

                    q_temp = generate_question_template(s_root_word,
                                                        q_root_word,
                                                        strict=strict,
                                                        join_char=join_char)

                    # bail out early if no question template could be built
                    if not q_temp:
                        record('no_q_template')
                        continue

                    to_check = q_temp[:-1] if rtl else q_temp[1:]
                    S_t = sum(type(x) == TemplateElement for x in to_check)
                    S_nt = len(to_check) - S_t
                    if S_t == 0:
                        continue

                    qw = q_temp[-1] if rtl else q_temp[0]
                    if type(qw) == TemplateElement:
                        # the question-word slot is not a literal token, so
                        # there is no question word to extract
                        continue

                    if rtl:
                        qw = q_temp.pop()
                        q_temp.append('<qw>')
                    else:
                        qw = q_temp.pop(0)
                        q_temp.insert(0, '<qw>')

                    a_temp = generate_answer_template(s_root_word,
                                                      answer,
                                                      join_char=join_char)
                    if not a_temp:
                        record('no_a_template')
                        continue

                    q_temp, a_temp = normalize_templates(q_temp, a_temp)
                    qtemp_without_qw = join_char.join(map(str, q_temp))

                    if qtemp_without_qw not in templates:
                        templates[qtemp_without_qw] = {
                            'question': q_temp,
                            'all_templates': S_nt == 0,
                            'answer': a_temp,
                            'qw': {}
                        }

                    assert templates[qtemp_without_qw]['all_templates'] == (
                        S_nt == 0), "Inconsistency in templates found"

                    if qw not in templates[qtemp_without_qw]['qw']:
                        templates[qtemp_without_qw]['qw'][qw] = {}

                    atemp_str = join_char.join(map(str, a_temp))
                    if atemp_str not in templates[qtemp_without_qw]['qw'][qw]:
                        templates[qtemp_without_qw]['qw'][qw][atemp_str] = {
                            'answer': a_temp,
                            'examples': []
                        }

                    templates[qtemp_without_qw]['qw'][qw][atemp_str][
                        'examples'].append({
                            'sentence': base_sentence.strip(),
                            'question': question.strip(),
                            'answer': answer.strip(),
                            # copy the node; otherwise the associated TreeList
                            # will be freed and we would get a memory error
                            'node': s_root_word.copy(),
                        })

                    # templates[f"{s_root_word.upos} #|@ {str(s_root_word.feats)} #|@ {s_root_word.child_has_prop('deprel', 'aux')} #|@ {non_temp_el} #|@ {q_temp}"][qw][a_temp].add(
                    #     base_sentence.strip() + " | " + question.strip() + " | " + answer.strip())
                elif strict:
                    record('impossible')

    idf = load_idf(idf_file) if idf_file else None

    final_templates, temp_id = [], 1
    temp_base = os.path.splitext(os.path.basename(temp_fname))[0]
    with open(temp_fname, "w") as f, open(sent_fname, 'w') as f1:
        for _, passport in templates.items():
            N_ex = sum([
                len(data['examples']) for _, endings in passport['qw'].items()
                for _, data in endings.items()
            ])

            q_tmpl = join_char.join(map(str, passport['question']))

            if passport['all_templates'] or N_ex >= min_support:
                idfs = [
                    idf.get(t, float('inf')) for t in passport['question']
                    if type(t) == str and t != '<qw>'
                ] if idf else []
                max_idf = max(idfs) if idfs else 0
                # log(4) threshold: the rarest literal template word must have
                # appeared in at least 25% of the documents
                if max_idf <= math.log(4):
                    for qw, endings in passport['qw'].items():
                        for a_tmpl, data in endings.items():
                            logging.debug(
                                "-- {} - {} - {} -> PASSED --".format(
                                    q_tmpl, passport['all_templates'], N_ex))

                            final_templates.append({
                                'question':
                                q_tmpl.replace("<qw>", qw),
                                'answer':
                                a_tmpl,
                                'props': [{
                                    'pos':
                                    x['node'].upos,
                                    'has_aux':
                                    x['node'].child_has_prop('deprel', 'aux'),
                                    'feats':
                                    x['node'].feats
                                } for x in data['examples']]
                            })

                            sent = "\n".join([
                                " | ".join([
                                    x['sentence'], x['question'], x['answer']
                                ]) for x in data['examples']
                            ])

                            tmpl = "{} => {}".format(
                                q_tmpl.replace('<qw>', qw), a_tmpl)
                            f.write("{}\n".format(tmpl))
                            f1.write("id: {}{}\n{}\n\n".format(
                                temp_base, temp_id, sent))
                            temp_id += 1
                else:
                    logging.debug(
                        "-- {} - {} - {} -> FAILED IDF ({}) --".format(
                            q_tmpl, passport['all_templates'], N_ex, max_idf))
            else:
                logging.debug("-- {} - {} - {} -> FAILED --".format(
                    q_tmpl, passport['all_templates'], N_ex))

    print_report(final_templates)

    return final_templates, temp_fname
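
An invocation sketch (per the parsing above, the input file holds one question #|@ answer #|@ sentence triple per line; the pipeline and file name are assumptions):

import stanza

stanza_lang = stanza.Pipeline("en", verbose=False)
final_templates, temp_fname = generate_templates("qa_triples.txt", stanza_lang,
                                                 min_support=2, dot_fix=True)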
Example #9
        if args.max_examples > 0:
            # sample one gold question and answer
            q_dict_keys = list(q_dict.keys())
            ind = np.random.choice(len(q_dict_keys))
            gold_q = q_dict_keys[ind]
            ind_a = np.random.choice(len(q_dict[gold_q]))
            gold_a = q_dict[gold_q][ind_a]

        total += len(q_dict)

        sent = re.sub(r' {2,}', '', sent)

        stanza_sent = stanza_dep_pipe(sent)
        with open(fname, 'w') as f:
            conll_list = CoNLL.convert_dict(stanza_sent.to_dict())
            f.write(CoNLL.conll_as_string(conll_list))
        trees = udon2.ConllReader.read_file(fname)

        res = overgenerate_questions(trees,
                                     guards_root,
                                     templates,
                                     template_examples,
                                     return_first=False)

        if res:
            idx_sorted_by_scores, qwf, atf, scores = rank(
                res,
                stanza_pipe,
                stanza_dep_pipe,
                qw_stat,
                a_tmpl,
Example #10
def write_doc_to_file(doc, out_file):
    conll_string = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    with open(str(out_file), "w") as fp:
        fp.write(conll_string)
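
Usage sketch (pipeline and output name are illustrative; assumes the stanza English model is downloaded):

import stanza

nlp = stanza.Pipeline("en", verbose=False)
write_doc_to_file(nlp("She left."), "parsed.conllu")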
Example #11
def test_conllu(processed_doc):
    assert CoNLL.conll_as_string(CoNLL.convert_dict(
        processed_doc.to_dict())) == EN_DOC_CONLLU_GOLD
Example #12
import stanza
from stanza.utils.conll import CoNLL

# stanza Spanish pipeline (tokenizer, tagger, lemmatizer and parser by default)
nlp = stanza.Pipeline(lang='es')

# The input to this is arbitrary, it could be a file if you wanted.
doc = nlp("Yo soy Diego. Soy de Puerto Rico.")

# Convert to conll format
stanza_conll = CoNLL.convert_dict(doc.to_dict())

# Write to conll format file - we could write multiple files for multiple
# different input sources here
with open('conll.txt', 'w+') as f:
    f.write(CoNLL.conll_as_string(stanza_conll))

# The columns we want (maybe we can get more info, I'm not sure)
COLUMN_TYPES = (
    'ignore',
    'words',
    'ignore',
    'pos',
    'ignore',
    'ignore',
    'ignore',
    'ignore',
    'ignore',
    'ignore',
)
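
One plausible next step for this tuple, reading the file back with NLTK's CoNLL corpus reader (an assumption about the intended use, not shown above; note that multi-word-token range lines come through as ordinary rows):

from nltk.corpus.reader.conll import ConllCorpusReader

reader = ConllCorpusReader('.', ['conll.txt'], COLUMN_TYPES)
print(reader.tagged_words('conll.txt')[:5])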
Example #13
def to_conllu(doc, RtoL=False):
    # Serialize a parse from any supported library (spaCy, stanza, classla,
    # stanfordnlp, NLTK, COMBO, supar, trankit, ...) into CoNLL-U.
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for s in doc.sents:
            for t in s:
                try:
                    m = str(t.morph)
                    if m.startswith("<spacy"):  # guard against a bare object repr
                        m = ""
                except AttributeError:  # spaCy v2 tokens have no .morph
                    m = ""
                c += str(t.i - s.start + 1)
                for i in [
                        t.orth_, t.lemma_, t.pos_, t.tag_, m,
                        str(0 if t.head == t else t.head.i - s.start + 1),
                        t.dep_, ""
                ]:
                    c += "\t_" if i.strip() == "" else "\t" + i
                if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                    u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
                else:
                    u = ""
                if RtoL and len(t.orth_) > 1:
                    # tokens containing characters above U+2FFF (CJK and other
                    # full-width scripts) get an explicit direction mark
                    if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                        u = "Direction=RtoL" if u == "" else "Direction=RtoL|" + u
                if not t.whitespace_:
                    u += ("" if u == "" else "|") + "SpaceAfter=No"
                if t.norm_ != "" and t.norm_ != t.orth_:
                    u += ("" if u == "" else "|") + "Translit=" + t.norm_
                if u == "":
                    u = "_"
                c += "\t" + u + "\n"
            c += "\n"
        return c
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        return CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8:
        return doc.to_conll()
    elif s.find("stanfordnlp") == 8:
        return doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        return doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        return sentence2conllu(doc, False).serialize()
    elif s.find("supar") == 8:
        if hasattr(doc, "sentences"):
            return "".join([str(s) + "\n" for s in doc.sentences])
        else:
            return str(doc) + "\n"
    elif s.find("list") == 8:
        return "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    elif s.find("dict") == 8 and "sentences" in doc:
        from trankit.utils.conll import CoNLL
        d = []
        for s in doc["sentences"]:
            e = []
            for t in s["tokens"]:
                if "span" in t:
                    i, j = t["span"]
                    t["misc"] = "start_char=" + str(i) + "|end_char=" + str(j)
                e.append(t)
                if "expanded" in t:
                    e.extend(t["expanded"])
            d.append(list(e))
        return CoNLL.conll_as_string(CoNLL.convert_dict(d))
    return str(doc)
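
Usage sketch (assumes spaCy's small English model is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
print(to_conllu(nlp("I saw her. She left.")))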