Example #1
def create_udpipe_pipeline(lang: str) -> UDPipeLanguage:
    try:
        pipeline = spacy_udpipe.load(lang)
    except Exception:
        spacy_udpipe.download(lang)
        pipeline = spacy_udpipe.load(lang)
    if pipeline is None:
        del pipeline
        raise ValueError(f'The `{lang}` language cannot be loaded by UDPipe!')
    return pipeline
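A minimal usage sketch for the helper above (hypothetical call site; it assumes spacy_udpipe is importable and that "en" is a language code spacy-udpipe can download):

import spacy_udpipe

nlp = create_udpipe_pipeline("en")
doc = nlp("The model is downloaded on first use and simply loaded afterwards.")
# Print token text with UPOS and dependency labels from the UDPipe pipeline.
print([(token.text, token.pos_, token.dep_) for token in doc])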
Example #2
 def __init__(self, lang='EN'):
     self.lang = lang
     self.tokenize_functions = {
         "EN": self._tokenizeDataEN,
         "CS": self._tokenizeDataCS
     }
     self.is_trained = False  # todo
     self.nlp = spacy_udpipe.load("en") if lang == 'EN' else spacy_udpipe.load("cs")
     self.default_stop_words = False
     self.stop_words = []
     self._vectorizer = None
Example #3
def test_pipe(lang: str) -> None:
    nlp = load(lang=lang)

    text = "spacy-udpipe still does not support multiprocess execution."
    doc = nlp(text)
    del nlp

    nlp = load(lang=lang)
    texts = [text for _ in range(2)]
    docs = list(nlp.pipe(texts, n_process=-1))

    assert len(docs) == len(texts)
    assert docs[0].to_json() == doc.to_json()
    assert docs[-1].to_json() == doc.to_json()
Example #4
def get_pos_ud_head(training_data, lang='en', document=None):

    try:
        ud_model = spacy_udpipe.load(lang)
        print(lang, ' model is used.')

    except Exception:

        spacy_udpipe.download(lang)
        print('downloaded model: ', lang)

        ud_model = spacy_udpipe.load(lang)

    sent_pos = []
    sent_ud = []
    sent_head = []
    sent_tok = []

    # get pos and ud tag
    for line in training_data:

        #print(line)
        temp_pos = []
        temp_ud = []
        temp_head = []
        temp_tok = []
        #print(line)
        tag_sent = ud_model(line)
        for i, token in enumerate(tag_sent):
            temp_pos.append(token.pos_)
            temp_ud.append(token.dep_)
            temp_tok.append(token.text)

            # if token.head == token:
            #     head = 0
            #     temp_head.append(head)
            #     print(token, token.head, head)
            # else:

            head = token.head.i - tag_sent[0].i + 1
            temp_head.append(head)
            #print(token, token.head, head)

        sent_pos.append(temp_pos)
        sent_ud.append(temp_ud)
        sent_head.append(temp_head)
        sent_tok.append(temp_tok)

    return sent_pos, sent_ud, sent_head, sent_tok
Example #5
def test_serialization(lang: str) -> None:
    with tempfile.TemporaryDirectory() as tdir:
        nlp = load(lang=lang)
        nlp.to_disk(tdir)

        udpipe_model = UDPipeModel(lang=lang)
        nlp = spacy.load(tdir, udpipe_model=udpipe_model)
Example #6
def main():
    docs_dir = '/home/zal/Devel/OperaSpNLP/docs'
    output_dir = '/home/zal/Devel/OperaSpNLP/output/udpipe/ner'
    file_ext = 'txt'

    nlp = spacy_udpipe.load('es-ancora')

    entity_dict = defaultdict(list)

    for file_path in glob(osp.join(docs_dir, f'*.{file_ext}')):
        doc_text = unidecode(open(file_path, 'r').read())
        doc = nlp(doc_text)
        for ent in doc.ents:
            entity_dict[ent.label_].append(ent.text)

    print(f'Total labels: {len(entity_dict.keys())}')
    print(f'{entity_dict.keys()}')

    for ner_label in entity_dict.keys():
        print(
            f'===================================={ner_label}: {len(entity_dict[ner_label])}'
        )
        ner_label_count_tuple = count_and_tuple(entity_dict[ner_label])
        print_count_tuple(ner_label_count_tuple)
        save_path = osp.join(output_dir, f'{ner_label}.out')
        save_count_tuple(ner_label_count_tuple, save_path)
Example #7
 def _default_tagger(self):
     try:
         import spacy_udpipe
     except ImportError:
         raise ImportError(
             'You are missing a POS tagger; try `pip install spacy_udpipe`')
     spacy_udpipe.download('en')
     return spacy_udpipe.load('en')
Example #8
def _init(lang='cs', download_model=None):
    # ('cs', 'cs-pdt', 'cs-cac', 'cs-fictree', 'cs-cltt'):
    if download_model:
        spacy_udpipe.download(download_model)
    nlp = spacy_udpipe.load(lang)
    syllables = SpacySyllables(nlp, lang=lang)
    nlp.add_pipe(syllables)
    return nlp
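A short usage sketch for the helper above (hypothetical; it assumes the spacy-syllables package is installed and that a Czech UDPipe model can be downloaded):

nlp = _init(lang='cs', download_model='cs')
doc = nlp("Tohle je ukázková věta.")
for token in doc:
    # spacy-syllables exposes these custom token attributes
    print(token.text, token._.syllables, token._.syllables_count)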
Example #9
def test_morph_exception() -> None:
    assert spacy.__version__ <= SPACY_VERSION

    lang = RO
    text = "Ce mai faci?"

    download(lang=lang)

    try:
        nlp = load(lang=lang)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    except ValueError:
        nlp = load(lang=lang, ignore_tag_map=True)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)

    assert doc
Example #10
def test_feats() -> None:
    lang = RU
    text = "Я люблю машинное обучение."

    download(lang=lang)

    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"
    doc = nlp(text)
    assert doc[2]._.feats == "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing"
Example #11
 def __init__(self):
     with resources.path("src.resources",
                         "it-sentiment_lexicon.lmf.xml") as bad_words:
         self.__bad_words = {
             word.rstrip().lower()
             for word in open(bad_words, 'r', encoding='utf8')
             if word.rstrip().lower() != ''
         }
     self.__stemmer = SnowballStemmer('italian')
     self.__nlp = spacy_udpipe.load("it-postwita")
Example #12
def test_serialization(lang: str) -> None:
    with tempfile.TemporaryDirectory() as tdir:
        nlp = load(lang=lang)
        doc = nlp("A simple sentence.")
        nlp.to_disk(tdir)
        del nlp

        nlp = spacy.load(tdir)
        same_doc = nlp("A simple sentence.")

        assert doc.to_json() == same_doc.to_json()
Example #13
    def __init__(self, lang_or_model, nlp_str):

        if nlp_str == "stanza":
            self.nlp = stanza.Pipeline(
                lang_or_model,
                processors='tokenize,pos,lemma,depparse',
                use_gpu=True,
                pos_batch_size=2000,
                depparse_batch_size=2000)
        elif nlp_str == "udpipe":
            self.nlp = spacy_udpipe.load(lang_or_model)
            self.tagmap = self.nlp.vocab.morphology.tag_map
Example #14
def main(path_to_input, out_dir=''):
    """ Generate plot for an input file or multiple files in a folder.
    """
    nlp = spacy_udpipe.load('nb')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    if os.path.isdir(path_to_input):  # folder as input
        for rel_f in sorted(os.listdir(path_to_input)):
            file_name, ext = os.path.splitext(rel_f)
            out_f = os.path.join(out_dir, file_name + ".png")
            if ext == ".uio":
                path_to_file = os.path.join(path_to_input, rel_f)
                pedigree = rel_to_linkage(path_to_file, nlp, out_dir)
                linkage_f = os.path.join(out_dir, file_name + '.ped')
                names = ",".join(list(pedigree.id_mapping.keys()))
                #linkage_to_plot(linkage_f, out_f, names) # TO DO: finish
    else:  # file as input
        path, file_name = os.path.split(path_to_input)
        file_name, ext = os.path.splitext(file_name)
        linkage_f = os.path.join(out_dir, file_name + '.ped')
        out_f = os.path.join(out_dir, file_name + ".png")
        if path_to_input.endswith(
                '.uio'):  # TO DO: load pedigree obj from .pkl
            pedigree = rel_to_linkage(path_to_input, nlp, out_dir)
            names = ",".join(list(pedigree.id_mapping.keys()))
            linkage_to_plot(linkage_f, out_f, names)
        elif file_name == 'example1_gold':  # get a gold standard plot
            id_mapping = {
                'pasient': 1,
                'mor': 2,
                'far': 5,
                'farmor': 6,
                'farfar': 7,
                'farbror': 8,
                'søster': 9,
                'fetter': 10,
                'fetter2': 11,
                'tante': 12,
                'bror': 13,
                'barn': 14,
                'partner': 15,
                'barn2': 16
            }
            names = ",".join(list(id_mapping.keys()))
            linkage_to_plot(linkage_f, out_f, names)
        elif path_to_input.endswith('.ped'):
            print('in gen')
            linkage_to_plot(linkage_f, out_f)
            # TO DO: parse additional arg with names?
        else:
            raise ValueError(
                'Input file(s) must have the extension .ped or .uio')
Example #15
def get_doc(language="ar", size=1):
    PATH = "/Users/abdulrahimqaddoumi/Desktop/" + language
    clean_texts = generate_clean_text(PATH)
    spacy_udpipe.download(language)
    nlp = spacy_udpipe.load(language)
    text_length = len(clean_texts)
    hundredth = text_length // 100
    start_time = time.time()
    for i in range(size):
        print(i * hundredth, (1+i) * hundredth)
    doc = nlp(clean_texts[:100000])
    print("--- %s seconds ---" % (time.time() - start_time))
    return doc
Example #16
def get_udpipe_parser(lang='ru'):
    global udpipe_nlp
    if udpipe_nlp.get(lang, None) is None:
        try:
            spacy_udpipe.download(lang)  # download the model for the requested language
            udpipe_nlp[lang] = spacy_udpipe.load(lang)
            # nlp.add_pipe(nlp.create_pipe('sentencizer'))
        except Exception:
            print(f'error loading udpipe model for {lang}')

        # parser_nlp.add_pipe(parser_nlp.create_pipe('sentencizer'))

    return udpipe_nlp[lang]
Example #17
def compute_gram_diversity(sentences,
                           lang="en",
                           system_name="",
                           freq_voc=None):
    ''' Compute lemma-based gram diversity metrics.

        :param sentences: the sentences to analyse
        :param lang: language code used to load the spacy-udpipe tokenizer
        :param system_name: name of the system that produced the sentences
        :param freq_voc: optional frequency vocabulary passed on to get_lemmas
        :returns: a tuple of (Simpson, inverse Simpson, Shannon) diversity scores
    '''
    nlpD = spacy_udpipe.load(lang).tokenizer
    nlpD.max_length = 300000000

    lemmas = get_lemmas(sentences, nlpD, system_name, freq_voc)

    return (compute_simpDiv(lemmas), compute_invSimpDiv(lemmas),
            compute_shannonDiv(lemmas))
Example #18
def test_spacy_udpipe(lang: str) -> None:
    nlp = load(lang=lang)

    text = "Attention aux articles contractés!"
    doc = nlp(text=text)

    assert [t.orth_ for t in doc
            ] == ["Attention", "à", "les", "articles", "contractés", "!"]

    pos = [{"INTJ", "NOUN"}, {"ADP"}, {"DET"}, {"NOUN"}, {"VERB", "ADJ"},
           {"PUNCT"}]
    for i, t in enumerate(doc):
        assert t.pos_ in pos[i]

    assert [t.head.i for t in doc] == [0, 3, 3, 0, 3, 0]

    dep = [{"ROOT", "root"}, {"case"}, {"det"}, {"nmod", "obl", "obl:arg"},
           {"acl", "amod"}, {"punct"}]
    for i, t in enumerate(doc):
        assert t.dep_ in dep[i]
Example #19
def main():
    docs_dir = '/home/zal/Devel/OperaSpNLP/docs'
    output_dir = '/home/zal/Devel/OperaSpNLP/output/udpipe/pos'
    file_ext = 'txt'

    nlp = spacy_udpipe.load('es-ancora')
    neuralcoref.add_to_pipe(nlp)

    pos_dict = defaultdict(list)

    for file_path in glob(osp.join(docs_dir, f'*.{file_ext}')):
        doc_text = unidecode(open(file_path, 'r').read())
        doc = nlp(doc_text)
        for token in doc:
            pos_dict[token.pos_].append(token.text)

    for pos_tag in pos_dict.keys():
        print(f'{pos_tag}: {len(pos_dict[pos_tag])}')
        save_path = osp.join(output_dir, f'{pos_tag}.out')
        with open(save_path, 'w') as fout:
            fout.write('\n'.join([word for word in pos_dict[pos_tag]]))
Example #20
import spacy_udpipe
import pandas as pd
from pprint import pprint
import pickle

spacy_udpipe.download("en")  # download English model
nlp = spacy_udpipe.load("en")

#text = "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world. Das ist ein zweiter Satz."
#doc = nlp(text)
#sentences = [sent.string.strip() for sent in doc.sents]
#print(sentences)

# with open('../bert_final/reviews_as_raw_text.txt') as fopen:
#     reviews = fopen.read().split('\n')[:-1]
# #print(reviews)
# df = pd.DataFrame({"review_text":reviews})
#
# #print(df)
#
# testliste = []
# testliste2 = []
# for index, review in enumerate(df["review_text"]):
#     rev_doc = nlp(review)
#     testliste2 = ([sent.string.strip() for sent in rev_doc.sents])
#     #print(testliste2)
#     testliste.append(testliste2)
# pprint(testliste)

# with open('outfile', 'wb') as fp:
#     pickle.dump(testliste, fp)
Example #21

import os
import re
import stanza
import spacy_udpipe

EXTERNAL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'external_data')
nlp_udpipe = spacy_udpipe.load(lang="hy")
nlp_stanza = stanza.Pipeline(use_gpu=False,
                             lang='hy',
                             processors='tokenize, mwt, pos, lemma, depparse')


def lemmatizer(text: str):
    doc = nlp_stanza(text)
    return [
        word.lemma for sentence in doc.sentences for word in sentence.words
    ]


def pos_tagger(text: str):
    doc = nlp_stanza(text)
    return [word.pos for sentence in doc.sentences for word in sentence.words]


def word_tokenize(text: str, remove_punctuation=False):
    text = remove_punct(text) if remove_punctuation else text
    doc = nlp_udpipe(text)
    return [word.text for word in doc]

Example #22
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
           installed, e.g. spacy-stanza. Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). For stanza and
           stanfordnlp, this will also cause sentence segmentation *only* to be done by splitting on new lines.
           See the documentation: https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           See the documentation: https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
    :param parser_opts: will be passed to the core pipeline. For spacy and udpipe, it will be passed to their
           `.load()` initialisations, for stanfordnlp and stanza `parser_opts` is passed to their `.Pipeline()`
           initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        nlp = spacy.load(model_or_lang, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang,
                                    tokenize_pretokenized=is_tokenized,
                                    **parser_opts)
        nlp = StanfordNLPLanguage(snlp)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang=model_or_lang,
                               tokenize_pretokenized=is_tokenized,
                               **parser_opts)
        nlp = StanzaLanguage(snlp)
    elif parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **parser_opts)
    else:
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    conllformatter = ConllFormatter(nlp, **kwargs)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
Example #23
znaki=[",","/",".","'",'"',"&","!","?",":",";","«","»"]
for word in spl_text:
  for z in znaki:
    if z in word:
      word = word.replace(z, "")
  pre_text.append(word)
print(pre_text)

!pip install spacy-udpipe

!pip install pymorphy2

import spacy_udpipe

spacy_udpipe.download("ru")
nlp = spacy_udpipe.load("ru")

text = "на столе стоит протекшая банка"
doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)

d = nlp(txt)

yo_true = 0
yo_missed = 0
tags = {'ADJS': 'ADJ', 'ADJF': 'ADJ', 'PRTF': 'VERB', 'PRTS': 'VERB', 'NOUN': 'NOUN'}
ts = []

for i in pre_text:
  if 'ё' in str(i):
Example #24
@author: chenfish
"""

# MDD

import spacy_udpipe
from nltk.tokenize import word_tokenize

#mockup, list of sents
training_data = [
    'Churkin said, that the UN Security, Council meeting on Crimea was useful.'
]

#load the UD model of English
ud_model = spacy_udpipe.load("en")

sent_pos = []
sent_ud = []
sent_head = []

# get pos and ud tag
for line in training_data:

    #print(line)
    temp_pos = []
    temp_ud = []
    temp_head = []
    #print(line)
    tag_sent = ud_model(line)
    for i, token in enumerate(tag_sent):
Example #25
print("Stanza model initialization ends")

print("SpaCy model initialization starts")
spacy_en = spacy.load("en_core_web_sm")
spacy_zh = spacy.load("zh_core_web_sm")
spacy_es = spacy.load("es_core_news_sm")
spacy_ja = spacy.load("ja_core_news_sm")
spacy_de = spacy.load("de_core_news_sm")
spacy_fr = spacy.load("fr_core_news_sm")
spacy_it = spacy.load("it_core_news_sm")
spacy_nl = spacy.load("nl_core_news_sm")
spacy_pt = spacy.load("pt_core_news_sm")
print("SpaCy model initialization ends")

print("UDpipe model initialization starts")
udpipe_en = spacy_udpipe.load("en")
udpipe_zh = spacy_udpipe.load("zh")
udpipe_es = spacy_udpipe.load("es")
udpipe_ja = spacy_udpipe.load("ja")
udpipe_de = spacy_udpipe.load("de")
udpipe_fr = spacy_udpipe.load("fr")
udpipe_it = spacy_udpipe.load("it")
udpipe_nl = spacy_udpipe.load("nl")
udpipe_pt = spacy_udpipe.load("pt")
udpipe_ar = spacy_udpipe.load("ar")
udpipe_ru = spacy_udpipe.load("ru")
print("UDpipe model initialization ends")

model_lang_map["spacy"] = {
    "eng": spacy_en,
    "cmn": spacy_zh,
Example #26
 def __init__(self, language='fi-tdt'):
     self.name = f'UDPipe-{language}'
     self.nlp = spacy_udpipe.load(language)
Example #27
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tqdm import trange
import spacy_udpipe


import collections
import json

nlp = spacy_udpipe.load('en')
# tokenizer = Tokenizer(nlp.vocab)


def load_json(js_path, nlp, output):
    with open(js_path) as fi:
        with open(output, 'w') as fo:
            line = fi.readline().strip()
            while line:
                mrp = json.loads(line)
                if 'input' in mrp:
                    tokens, lemmas, pos = [], [], []
                    for t in nlp(mrp['input']):
                        tokens.append(t.text)
                        lemmas.append(t.lemma_)
                        pos.append(t.pos_)
                        companions = []
                        # length = 0
                        init = 0
                    for idx, token in enumerate(tokens):
                        begin = mrp['input'][init:].find(token)
                        # print(mrp['input'][init+begin:init+begin+len(token)])
Example #28
 def __init__(self, language='ru', add_postags=True, vocabulary=None):
     self.pipeline = spacy_udpipe.load(language)
     self.add_postags = add_postags
     self.vocabulary = vocabulary
Example #29
def init_parser(
    model_or_lang: str,
    parser: str,
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    exclude_spacy_components: Optional[List[str]] = None,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.
    :param model_or_lang: language model to use (must be installed for spaCy but will be automatically downloaded for
           stanza and UDPipe)
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
           installed, e.g. spacy-stanza
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). When using 'spacy',
           this option also disables sentence segmentation completely. For stanza, sentence segmentation will *only*
           be done by splitting on new lines.

           See the stanza documentation for more:
           https://stanfordnlp.github.io/stanza/tokenize.html#start-with-pretokenized-text

           This option does not affect UDPipe.
    :param disable_sbd: disables automatic sentence boundary detection in spaCy and stanza. For stanza, make sure that
           your input is in the correct format, that is: sentences must be separated by two new lines. If you want to
           disable both tokenization and sentence segmentation in stanza, do not enable this option but instead only
           use `is_tokenized` and make sure your sentences are separated by only one new line.

           See the stanza documentation for more:
           https://stanfordnlp.github.io/stanza/tokenize.html#tokenization-without-sentence-segmentation

           This option does not affect UDPipe.
    :param exclude_spacy_components: spaCy components to exclude from the pipeline, which can greatly improve
           processing speed. Only works when using spaCy as a parser.
    :param parser_opts: will be passed to the core pipeline. For spacy, it will be passed to its
           `.load()` initialisations, for stanza `parser_opts` is passed to its `.load_pipeline()`
           initialisations. UDPipe does not have any keyword arguments
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        exclude = ["senter", "sentencizer"
                   ] if disable_sbd or is_tokenized else []
        exclude = exclude + exclude_spacy_components if exclude_spacy_components is not None else exclude
        nlp = spacy.load(model_or_lang, exclude=exclude, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd or is_tokenized:
            try:
                nlp.add_pipe("disable_sbd", before="parser")
            except ValueError:
                nlp.add_pipe("disable_sbd", first=True)
    elif parser == "stanza":
        import spacy_stanza  # noqa: F811
        import stanza

        verbose = parser_opts.pop("verbose", False)
        stanza.download(model_or_lang, verbose=verbose)
        nlp = spacy_stanza.load_pipeline(
            model_or_lang,
            verbose=verbose,
            tokenize_no_ssplit=disable_sbd,
            tokenize_pretokenized=is_tokenized,
            **parser_opts,
        )
    elif parser == "udpipe":
        import spacy_udpipe  # noqa: F811

        spacy_udpipe.download(model_or_lang)
        nlp = spacy_udpipe.load(model_or_lang)
    else:
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanza', 'udpipe'"
        )

    nlp.add_pipe("conll_formatter", config=kwargs, last=True)

    return nlp
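A minimal usage sketch for init_parser above (hypothetical; it assumes the spacy_conll package that defines this helper is installed together with spacy-udpipe, and that its conll_formatter component sets the `conll_str` extension as in spacy_conll's documentation):

nlp = init_parser("en", "udpipe")
doc = nlp("A simple sentence.")
# The conll_formatter component stores the CoNLL-U representation on the Doc.
print(doc._.conll_str)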
Example #30
 def _get_model(self, iso):
     import spacy_udpipe
     nlp = spacy_udpipe.load(iso)
     return nlp