Example #1
def main():
    args = parse_args()
    random.seed(args.seed)

    args = vars(args)

    print("[Launching identity lemmatizer...]")

    if args['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    document = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(document,
                       args['batch_size'],
                       args,
                       evaluation=True,
                       conll_only=True)
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # use identity mapping for prediction
    preds = batch.doc.get([TEXT])

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
Example #2
def makeDoc(doc):
    s = str(type(doc))
    if s.find("spacy") == 8:
        return doc
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        d = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        d = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        d = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        d = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        d = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        d = str(doc)
    DOC = []  # lightweight token objects built from the 10-column CoNLL-U rows
    m = []    # multi-word token ranges such as "1-2": (position in DOC, start id, end id, form)
    misc = ""
    for t in d.split("\n"):
        x = t.split("\t")
        if len(x) != 10:
            continue
        try:
            i, j = int(x[0]), int(x[6])
        except ValueError:
            # a range id such as "1-2" marks a multi-word token: remember it and skip the row
            try:
                i = x[0].index("-")
                j = int(x[0][0:i])
                k = int(x[0][i + 1:])
                m.append((len(DOC), j, k, x[1]))
                continue
            except ValueError:
                continue
        # an anonymous class is used as a mutable per-token record
        s = type("", (object, ), {"i": i})
        s.orth_ = x[1]
        s.pos_ = x[3]
        s.head = j
        s.dep_ = x[7]
        s.whitespace_ = (x[9].find("SpaceAfter=No") < 0)
        if s.whitespace_:
            i = x[9].find("start_char=")
            if i >= 0:
                j = x[9].find("|", i)
                k = x[9][i + 5:] if j < 0 else x[9][i + 5:j]
                if misc.find("end" + k) >= 0:
                    DOC[-1].whitespace_ = False
        DOC.append(s)
        misc = x[9]
    # attach contraction info to the last token of each multi-word range
    for i, j, k, f in reversed(m):
        offset = i - DOC[i].i
        DOC[k + offset].contract = (f, [i + offset for i in range(j, k + 1)])
    # resolve numeric heads into references to the corresponding token objects (roots point to themselves)
    for i, t in enumerate(DOC):
        if t.head == 0:
            t.head = t
        else:
            t.head = DOC[i + t.head - t.i]
    return DOC
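A minimal usage sketch for makeDoc above, assuming the function is in scope next to a stanza pipeline that includes a depparse processor (so the HEAD column is numeric); the attribute names mirror those set inside the function.

import stanza

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
tokens = makeDoc(nlp("She reads books."))
for t in tokens:
    # after the final loop in makeDoc, t.head is itself a token object
    print(t.i, t.orth_, t.pos_, t.dep_, t.head.orth_)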
Example #3
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            if es is not None:
                edits += es
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
Example #4
    def get_connlu_sentence(self, sentence: str) -> str:
        processed_sentence = self._preprocess(sentence)

        doc_response = self._stanford_annotator._annotator(processed_sentence)
        fp, tmp = tempfile.mkstemp()
        CoNLL.write_doc2conll(doc_response, tmp)
        with open(tmp, encoding='utf-8') as f:
            conll_string = f.read()
        return conll_string
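A hedged alternative to the temp-file round trip above: for stanza versions where CoNLL.doc2conll returns a list of line lists per sentence (as the tests in Examples #11 and #23 below assume), the CoNLL string can be built directly in memory.

from stanza.utils.conll import CoNLL

def doc_to_conll_string(doc):
    # join the per-sentence line lists produced by CoNLL.doc2conll
    return "\n\n".join("\n".join(sentence) for sentence in CoNLL.doc2conll(doc)) + "\n\n"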
Example #5
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
    model_file = os.path.join(args['save_dir'], save_name)

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(
            batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)

            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(
                    batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.write_doc2conll(doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("MWT expansion score: {} {:.2f}".format(
            args['shorthand'], score * 100))
Example #6
    def annotate(self, text: str):
        doc_response = self._annotator(text)
        fp, tmp = tempfile.mkstemp()
        CoNLL.write_doc2conll(doc_response, tmp)
        with open(tmp, encoding='utf-8') as f:
            conll_string = f.read()
        return [
            self._sentence_to_df(sentence)
            for sentence in conll_string.split("\n\n") if len(sentence) > 0
        ]
Example #7
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                               args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)
    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
Example #8
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    model_file = model_file_name(args)
    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
Example #9
 def __init__(self,UniDic,UDPipe):
   self.UniDic=UniDic
   if UniDic!=None:
     d=os.path.join(DOWNLOAD_DIR,UniDic)
     r=os.path.join(PACKAGE_DIR,"mecabrc")
     if os.path.isdir(d):
       try:
         from MeCab import Tagger
       except:
         from fugashi import GenericTagger as Tagger
       self.mecab=Tagger("-r "+r+" -d "+d).parse
     elif UniDic=="unidic-lite":
       try:
         from MeCab import Tagger
       except:
         from fugashi import GenericTagger as Tagger
       import unidic_lite
       self.mecab=Tagger("-r "+r+" -d "+unidic_lite.DICDIR).parse
     elif UniDic=="ipadic":
       try:
         from MeCab import Tagger
       except:
         from fugashi import GenericTagger as Tagger
       try:
         import ipadic
         self.mecab=Tagger(ipadic.MECAB_ARGS).parse
       except:
         self.mecab=Tagger().parse
     else:
       d={ "gendai":"dic1", "spoken":"dic2", "qkana":"dic3", "kindai":"dic4", "kinsei":"dic5", "kyogen":"dic6", "wakan":"dic7", "wabun":"dic8", "manyo":"dic9" }
       self.dictkey=d[UniDic]
       self.mecab=self.ChamameWebAPI
   self.udpipe=self.UDPipeWebAPI
   if UDPipe==None:
     self.model="japanese-gsd"
   else:
     self.model=UDPipe
     m=os.path.join(DOWNLOAD_DIR,self.model+".udpipe")
     if os.path.isfile(m):
       import ufal.udpipe
       self.model=ufal.udpipe.Model.load(m)
       if UniDic==None:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"tokenizer=presegmented","","","").process
       else:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"conllu","none","","").process
     elif self.model.startswith("stanza_"):
       import stanza
       if UniDic==None:
         self.model=stanza.Pipeline(self.model[7:],verbose=False)
         from stanza.utils.conll import CoNLL
         self.udpipe=lambda text:CoNLL.conll_as_string(CoNLL.convert_dict(self.model(text).to_dict()))
       else:
         self.model=stanza.Pipeline(self.model[7:],processors="depparse",depparse_pretagged=True,verbose=False)
         self.udpipe=self.StanzaAPI
Example #10
 def StanzaAPI(self,conllu):
   d=[]
   e=[]
   for s in conllu.split("\n"):
     if s=="" or s.startswith("#"):
       if e!=[]:
         d.append(list(e))
         e=[]
     else:
       t=s.split("\t")
       e.append({"id":int(t[0]),"text":t[1],"lemma":t[2],"upos":t[3],"xpos":t[4],"misc":t[9]})
   from stanza.models.common.doc import Document
   from stanza.utils.conll import CoNLL
   return CoNLL.conll_as_string(CoNLL.convert_dict(self.model(Document(d)).to_dict()))
Example #11
def test_unusual_misc():
    """
    The RUSSIAN_SAMPLE above resulted in a blank misc field in one particular implementation of the conll code
    (which would make the assertions below fail)
    """
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1
    assert len(sentences[0]) == 14

    for word in sentences[0]:
        pieces = word.split("\t")
        assert len(pieces) == 1 or len(pieces) == 10
        if len(pieces) == 10:
            assert all(piece for piece in pieces)
Example #12
def print_conll_sen(sen, sent_id=None, swaps=()):
    out = f'# sent_id = {sent_id}\n# text = {sen.text}\n'
    for fields in CoNLL.convert_dict([sen.to_dict()])[0]:
        for i, j in swaps:
            fields[i], fields[j] = fields[j], fields[i]
        out += "\t".join(fields) + '\n'
    return out
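A usage sketch for print_conll_sen, assuming a stanza pipeline that includes depparse; the swap of columns 3 and 4 (UPOS and XPOS in CoNLL-U) is only an illustration of the swaps argument.

import stanza

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
doc = nlp("The cat sat on the mat.")
print(print_conll_sen(doc.sentences[0], sent_id="s1", swaps=((3, 4),)))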
Example #13
def preprocess_to_stream(corpus_filename, lang):
    """Pre-process (tokenize, segment) the specified raw text corpus using the Stanford's Stanza library.

    Args:
        corpus_filename: Filename of the raw text corpus to pre-process.
        lang: Language of Stanza model to use for pre-processing.

    Returns:
        A stream containing the pre-processed text in CoNLL format.
    """
    stanza_pipeline = stanza.Pipeline(lang=lang,
                                      processors='tokenize,mwt',
                                      use_gpu=False)
    with open(corpus_filename, "r") as corpus_file:
        doc = stanza_pipeline(corpus_file.read())

    conll = CoNLL.convert_dict(doc.to_dict())
    conll_stream = StringIO()
    for sent in conll:
        for token in sent:
            print("\t".join(token), file=conll_stream)
        print(file=conll_stream)

    conll_stream.seek(0)

    return conll_stream
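A usage sketch under the assumption that "corpus.txt" exists; French is chosen here only because its models ship an MWT processor, which the 'tokenize,mwt' processor list requires.

stream = preprocess_to_stream("corpus.txt", "fr")
print(stream.read())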
Example #14
def check_mwt(filename):
    """
    Checks whether or not there are MWTs in the given conll file
    """
    doc = CoNLL.conll2doc(filename)
    data = doc.get_mwt_expansions(False)
    return len(data) > 0
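A one-line usage sketch; the file name is a placeholder for any CoNLL-U treebank file.

if check_mwt("fr_gsd-ud-train.conllu"):
    print("this treebank contains multi-word tokens")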
Example #15
def get_factory(sh, fn):
    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        raise UserWarning(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(
                fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        return key

    doc = CoNLL.conll2doc(input_file=train_file)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key
Example #16
def main(source, target, language):
    source = Path(source)
    target = Path(target)

    # https://stanfordnlp.github.io/stanza/neural_pipeline.html
    # https://stanfordnlp.github.io/stanza/depparse.html
    nlp = stanza.Pipeline(
        lang=language,
        processors="tokenize,mwt,pos,lemma,depparse",
    )

    # read text file content
    text = source.read_text()
    # process text with Stanza
    doc = nlp(text)
    # write processed document to CoNLL file
    CoNLL.write_doc2conll(doc, target)
Example #17
def parse_conllu(text, english_lines):
    # stanfordnlp.download('zh') # Download the English models
    nlp = stanza.Pipeline(
        "zh",
        processors='tokenize,pos,lemma,depparse',
        tokenize_pretokenized=True,
        tokenize_no_ssplit=True,
        # tokenize_model_path="/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/tokenize/zh_ontonotes_tokenizer.pt",
        pos_pretrain_path=
        "/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/pos/zh_ontonotes.pretrain.pt",
        pos_model_path=
        "/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/pos/zh_ontonotes_tagger.pt",
        depparse_pretrain_path=
        "/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/depparse/zh_ontonotes.pretrain.pt",
        depparse_model_path=
        "/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/depparse/zh_ontonotes_parser.pt"
    )
    # nlp = stanza.Pipeline("zh", processors='tokenize,pos,lemma,depparse', tokenize_pretokenized=True, tokenize_no_ssplit=True)
    ### nlp = stanza.Pipeline(processors='tokenize,pos,lemma,depparse', lang='zh', tokenize_pretokenized=True, use_gpu=True, pos_batch_size=3000)
    conllus = []
    chapter_initials = []
    text_fields = []
    for sid, sentence in enumerate(text):

        # if sid > 10:
        #     continue

        # Get Chapter initial sentences
        chapter_initials.append(
            True if re.match("^[IVX。 ]+$", sentence) else False)
        # Stanza parse UD
        doc = nlp(sentence)  # Run the pipeline on input text
        dicts = doc.to_dict()
        conllu = CoNLL.convert_dict(dicts)
        text_fields.append("".join([x[1] for x in conllu[0]]))
        for tokid in range(len(conllu[0])):
            conllu[0][tokid][3] = xpos2upos_dict[conllu[0][tokid][3]]  # change col 3 to UPOS
            conllu[0][tokid][9] = "_"  # remove start_char and end_char
        conllu = "\n".join(['\t'.join(x) for x in conllu[0]])
        conllus.append(conllu)
        print('o Done parsing sentence %d/%d' % (sid, len(text)), end="\r")

    # assert sum(chapter_initials) == 27

    # Write conllu string
    chapter_no = 1
    conllu_string = ""
    for sent_id in range(len(conllus)):
        if chapter_initials[sent_id]:
            conllu_string += "# newdoc_id = lpp_1943_zh_ch-%.2d\n" % chapter_no
            chapter_no += 1
        conllu_string += "# sent_id = lpp_1943_zh-%d\n# text = %s\n# en_sent_id = lpp_1943.%d\n# en_text = %s\n" % (
            sent_id + 1, text_fields[sent_id], sent_id + 1,
            english_lines[sent_id]) + conllus[sent_id] + "\n\n"

    return conllu_string
Example #18
def main(args):

    sents = load_conllu(args.input)

    stanza_nlp = stanza.Pipeline(
        lang=args.lg,
        dir=args.model,
        package=args.treebank,
        tokenize_no_ssplit=True,
        use_gpu=args.cuda,
    )

    doc = stanza_nlp(sents)
    doc_dict = doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)
    with open(args.output, "w") as wf:
        wf.write(doc_conll_str)
Пример #19
0
def tokenize_with_stanza(homebrew):
    import re
    import stanza
    import sqlite3
    import os
    from stanza.utils.conll import CoNLL
    # stanza.download("en")  # be sure to run this once on the first execution (requires an internet connection)
    nlp = stanza.Pipeline("en")

    db = sqlite3.connect("Monster.db")  # connect to the DB
    c = db.cursor()

    folder = "./tagged_stanza" #Erstellt einen Ordner um die TSVs zu speichern
    if not os.path.exists(folder):
        os.makedirs(folder)

    if homebrew:  # only matters for the file name
        fname_part = "/homebrew_"
        c.execute("SELECT DISTINCT Beschreibung FROM Homebrew")

    else:
        fname_part = "/5eTools_"
        c.execute(
            "SELECT Beschreibung, ID FROM Monster WHERE NOT Beschreibung = 'No information available.' AND NOT Beschreibung = ''")  # select only the relevant rows from the table

    monstertable = c.fetchall()
    pattern = r'\d+d\d+|initiative|challenge rating|CR\d+|advantage|disadvantage|saving throw| DC |lair action|Enter a description for your Monster here' # drop lines that obviously contain no description

    ID = 0
    for monster in monstertable:  # loop over the fetchall results
        if homebrew:
            if len(monster[0]) > 4:
                #print(len(monster[0]))
                description = monster[0][2:-2]
            else:
                continue
        else:
            description = monster[0]
            ID = monster[1]
        forstanza = ""

        lines = description.split(".")
        for y in lines:
            if not re.search(pattern, y, re.IGNORECASE) and not y == "":
                temp = y.replace(u'\\xa0', ' ').encode('utf-8').decode('utf-8', errors='replace')
                #temp = "".join(y.split())
                forstanza = forstanza + temp

        file = folder + fname_part + str(ID) + ".tsv"
        with open(file,"w",encoding="UTF-8") as f:
            if len(forstanza) > 0 and not forstanza == " ":
                print(forstanza)
                doc = nlp(forstanza)
                conll = CoNLL.convert_dict(doc.to_dict())
                for sentence in conll:
                    for token in sentence:
                        print("\t".join(token), file=f)
        ID = ID +1
Example #20
def process_line(line, nlp):
    try:
        doc = nlp(line)
        conll = CoNLL.convert_dict(doc.to_dict())
        return conll[0]

    except Exception as e:
        # print(e)
        return []
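A usage sketch for process_line, assuming a stanza pipeline with depparse; the function returns the token rows of the first sentence, or an empty list if the pipeline raises.

import stanza

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
for fields in process_line("A short test sentence.", nlp):
    print("\t".join(fields))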
Example #21
def get_depd_tree(
    doc: str,
    lg: str,
    stanza_model_path: Path,
    tokenize: bool = True,
    ssplit: bool = False,
    cuda: bool = False,
    verbose: bool = False,
) -> str:

    logging.info(f"generating SUD parse for the input document")

    model_dir = str(stanza_model_path)
    if tokenize and ssplit:
        stanza_nlp = stanza.Pipeline(lang=lg,
                                     dir=model_dir,
                                     use_gpu=cuda,
                                     verbose=verbose)
    elif tokenize:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            tokenize_no_ssplit=True,
            use_gpu=cuda,
            verbose=verbose,
        )
    else:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            tokenize_pretokenized=True,
            use_gpu=cuda,
            verbose=verbose,
        )

    stanza_doc = stanza_nlp(doc)
    doc_dict = stanza_doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)

    return doc_conll_str
Пример #22
0
def text2ud(id, nlp, text):
    res = []
    doc = nlp(text)
    sentences = doc.sentences
    for i, ud in enumerate(CoNLL.convert_dict(doc.to_dict())):
        res.append("# sent_id = %s.%d" % (id, i))
        res.append("# text = %s" % sentences[i].text)
        for l in ud:
            l[9] = "_"  # ignore last field (start_char,end_char)
            res.append("\t".join(l))
        res.append("")
    return res
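A usage sketch for text2ud, assuming a stanza pipeline with depparse; the id argument only seeds the sent_id comments.

import stanza

nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
print("\n".join(text2ud("doc1", nlp, "Hello there. How are you?")))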
Example #23
def test_doc_with_comments():
    """
    Test that a doc with comments gets converted back with comments
    """
    lines = RUSSIAN_SAMPLE.split("\n")

    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    assert len(doc.sentences) == 1
    assert len(doc.sentences[0].comments) == 3
    assert lines[0] == doc.sentences[0].comments[0]
    assert lines[1] == doc.sentences[0].comments[1]
    assert lines[2] == doc.sentences[0].comments[2]

    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1

    sentence = sentences[0]
    assert len(sentence) == 14
    assert lines[0] == sentence[0]
    assert lines[1] == sentence[1]
    assert lines[2] == sentence[2]
Example #24
    def predict(self, eval_file_or_string):
        eval_file = _read_conllu_arg(eval_file_or_string,
                                     self.feature_config,
                                     predict=True)
        doc = Document(CoNLL.conll2dict(input_file=eval_file))
        batch = DataLoader(doc,
                           self.batch_size,
                           self.loaded_args,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=True)

        preds = []
        if len(batch) > 0:
            for i, b in enumerate(batch):
                preds += self.trainer.predict(b)
        preds = utils.unsort(preds, batch.data_orig_idx)
        batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])

        doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
        conll_string = CoNLL.conll_as_string(doc_conll)
        return conll_string
Example #25
def lemmatize(lemmatizer, conllu, morphs):
    def clean_final(text):
        finals = {"פ":"ף","כ":"ך","מ":"ם","נ":"ן","צ":"ץ"}
        if text[-1] in finals:
            text = text[:-1] + finals[text[-1]]
        return text

    def post_process(word, pos, lemma, morph):
        if word == lemma:
            if word + "\t" + pos in lex:
                if pos == "VERB" and "Fut" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Pres" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Part" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos in ["NOUN", "ADJ"] and "Plur" in morph:
                    lemma = lex[word + "\t" + pos]
            else:
                if "Plur" in morph and pos in ["NOUN", "ADJ"] and (
                        word.endswith("ים") or word.endswith("ות")):
                    lemma = lemma[:-2]
                    if word.endswith("ות"):
                        lemma += "ה"
                    lemma = clean_final(lemma)
        return lemma

    uposed = [[l.split("\t") for l in s.split("\n")] for s in conllu.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)
    for sent in dicts:
        for tok in sent:
            tok["id"] = int(tok["id"][0])
    doc = Document(dicts)
    lemmatized = lemmatizer(doc)
    output = []
    counter = 0
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            lemma = word.lemma
            if lemmatizer.do_post_process:
                lemma = post_process(word.text, word.upos, word.lemma, morphs[counter])
            row = [str(word.id), word.text, lemma, word.upos, word.xpos, '_', str(word.head), "_", "_", "_"]
            output.append("\t".join(row))
            counter += 1
        output.append("")
    lemmatized = "\n".join(output)
    lemmatized = get_col(lemmatized,2)

    return lemmatized
Example #26
def test_depparse_with_pretagged_doc():
    nlp = stanza.Pipeline(
        **{
            'processors': 'depparse',
            'dir': TEST_MODELS_DIR,
            'lang': 'en',
            'depparse_pretagged': True
        })

    doc = CoNLL.conll2doc(input_str=EN_DOC_CONLLU_PRETAGGED)
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in processed_doc.sentences])
Example #27
    def parse_to_conll(self, fin, nlp_str="stanza"):
        with open(fin, 'r', encoding="utf-8") as readinput:
            ri = readinput.read()

        doc = self.nlp(ri)

        if nlp_str == "stanza":
            dicts = doc.to_dict()
            conll = CoNLL.convert_dict(dicts)
        elif nlp_str == "udpipe":
            stc = self._sentences_to_conllu(doc)
            conll = list(stc)

        return conll
Example #28
def prep_conllu(tb, file_path, overwrite):
    out_file = out_dir.joinpath(file_path.name)
    if out_file.exists() and not overwrite:
        print(f"{out_file.name} exists; skipping")
        return None
    lang, tb, tb_kwargs = determine_treebank(tb)
    if not lang:
        shutil.copy(file_path, out_file)
        return None
    doc = Document(CoNLL.conll2dict(input_file=file_path))
    nlp = stanza.Pipeline(lang=lang,
                          processors='tokenize,mwt,pos',
                          tokenize_pretokenized=True)
    doc = nlp.processors['pos'].process(doc)
    return doc
Example #29
def extract_stanza_conllu(doc, sentid):
    '''
    doc: a stanza doc object for the entire document
    sentid: an integer denoting sentence id in document
            (starts with 0)

    Output: conllu format string output of the sentence
    '''
    input_dict = doc.to_dict()
    conll = CoNLL.convert_dict(input_dict)
    string=""
    for item in conll[sentid]:
        string+=("\t".join(item))
        string+="\n"
    return string
def main():
    args = arguments()
    stanza.download(args.language)
    nlp = stanza.Pipeline(args.language,
                          processors="tokenize,mwt,pos,lemma,depparse")
    for fh in args.TEXT:
        filename = os.path.basename(fh.name)
        text = fh.read()
        doc = nlp(text)
        dicts = doc.to_dict()
        conll = CoNLL.convert_dict(dicts)
        with open(os.path.join(args.output_dir, filename + ".conllu"),
                  mode="w",
                  encoding="utf-8") as out:
            for sentence in conll:
                out.write("\n".join(("\t".join(token) for token in sentence)))
                out.write("\n\n")