def __init__(self, nlp):
    self.nlp = nlp
    # self._endpoint = os.getenv("AZ_TA_FOR_HEALTH_ENDPOINT")
    stanza.download("en", package="mimic", processors={"ner": "i2b2"})
    self._analyzer = stanza.Pipeline(
        "en", package="mimic", processors={"ner": "i2b2"}
    )
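# Hedged usage sketch (assumption, not part of the original snippet): the
# mimic/i2b2 combination above is stanza's clinical NER package; a standalone
# pipeline built the same way exposes the recognized entities on the Document.
import stanza

stanza.download("en", package="mimic", processors={"ner": "i2b2"})
analyzer = stanza.Pipeline("en", package="mimic", processors={"ner": "i2b2"})
doc = analyzer("The patient was prescribed 40 mg of aspirin for chest pain.")
print([(ent.text, ent.type) for ent in doc.entities])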
def __init__(self, lang='en'):
    import stanza
    try:
        self.pipeline = stanza.Pipeline(lang=lang, processors='tokenize',
                                        verbose=False, tokenize_no_ssplit=True)
    except Exception:
        stanza.download(lang=lang, resources_url='stanford')
        self.pipeline = stanza.Pipeline(lang=lang, processors='tokenize',
                                        verbose=False, tokenize_no_ssplit=True)
def _download_model(self) -> None:
    """Interface with the `stanza` model downloader."""
    if not self.interactive:
        if not self.silent:
            print(
                f"CLTK message: Going to download required Stanza models to ``{self.model_path}`` ..."
            )  # pragma: no cover
        stanza.download(lang=self.stanza_code, package=self.treebank)
    else:
        print(  # pragma: no cover
            "CLTK message: This part of the CLTK depends upon the Stanza NLP library."
        )  # pragma: no cover
        dl_is_allowed = query_yes_no(
            f"CLTK message: Allow download of Stanza models to ``{self.model_path}``?"
        )  # type: bool
        if dl_is_allowed:
            stanza.download(lang=self.stanza_code, package=self.treebank)
        else:
            raise CLTKException(
                f"Download of necessary Stanza model declined for '{self.language}'. Unable to continue with Stanza's processing."
            )
    # if the model file is still not available after the attempted download, raise an error
    if not file_exists(self.model_path):
        raise FileNotFoundError(
            "Missing required models for ``stanza`` at ``{0}``.".format(
                self.model_path
            )
        )
def test_spacy_stanza_german():
    lang = "de"
    stanza.download(lang)
    snlp = stanza.Pipeline(lang=lang)
    nlp = StanzaLanguage(snlp)
    with pytest.warns(UserWarning):
        doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten")
def create_model(vectors_loc=None, lang=None, stz=True, vectors_name='fasttext', max_items=-1):
    if lang is None or lang == 'sv' and not stz:
        nlp = Swedish()
    elif not stz:
        nlp = spacy.blank(lang)
    elif stz:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang=lang)
        nlp = StanzaLanguage(snlp)
    with open(vectors_loc, 'rb') as file_:
        logger.info("Reading file '{}'".format(vectors_loc))
        header = file_.readline()
        nr_row, nr_dim = header.split()  # the first line is the number of tokens and dimensions
        counter = 0
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            if counter % 100 == 0:
                logger.info(counter)
            if counter == max_items:
                break
            counter = counter + 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    nlp.vocab.vectors.name = vectors_name  # give vectors a name
    return nlp
def get_all_stanza_models() -> None:
    """Download language models, from the ``stanza`` project, that are supported by the CLTK or in scope.

    More here: `<https://stanfordnlp.github.io/stanza/models.html>`_.

    TODO: Use the CLTK stanza wrapper class to download the files
    """
    all_ud_models_for_cltk = dict(
        cop=["scriptorium"],
        cu=["proiel"],  # OCS
        fro=["srcmf"],  # Old French
        grc=["perseus", "proiel"],
        got=["proiel"],
        la=["ittb", "proiel", "perseus"],
        lzh=["kyoto"],
    )  # type: Dict[str, List[str]]
    stanford_dir = os.path.expanduser("~/stanza_resources/")  # type: str
    for lang_name, model_sources in all_ud_models_for_cltk.items():
        for model_source in model_sources:
            if lang_name == "cop":
                # Coptic errors out, for some reason, if we pass the package name ``scriptorium``
                stanza.download(lang=lang_name, dir=stanford_dir)
            else:
                stanza.download(lang=lang_name, dir=stanford_dir, package=model_source)
def nlp(self, model):
    if model == "default":
        # A default Stanza NLP pipeline
        stanza.download(lang="en", processors="tokenize")
        if self.tokenize_by in ["sentence", "sentence_by_document"]:
            BASIC_STANZA_PIPELINE = stanza.Pipeline(processors="tokenize")
        else:
            BASIC_STANZA_PIPELINE = stanza.Pipeline(
                processors="tokenize", tokenize_no_ssplit=True)
        self._nlp = BASIC_STANZA_PIPELINE
    else:
        if self.tokenize_by in ["sentence", "sentence_by_document"]:
            if model.config["tokenize_no_ssplit"]:
                model.processors["tokenize"].config["no_ssplit"] = False
                model.config["tokenize_no_ssplit"] = False
                warn(
                    "NLP does not have a sentencizer pipe; one has been added to tokenize by sentence."
                )
        else:
            if not model.config["tokenize_no_ssplit"]:
                model.processors["tokenize"].config["no_ssplit"] = True
                model.config["tokenize_no_ssplit"] = True
                warn(
                    "NLP contains a sentencizer pipe which has been removed to tokenize by document."
                )
        self._nlp = model
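# Hedged illustration (assumption, not from the original source) of what the
# ``tokenize_no_ssplit`` flag toggled above means in stanza: when it is True,
# sentence segmentation is skipped and each blank-line-separated block of the
# input is treated as one "sentence".
import stanza

stanza.download(lang="en", processors="tokenize")
pipe_split = stanza.Pipeline(processors="tokenize")
pipe_no_split = stanza.Pipeline(processors="tokenize", tokenize_no_ssplit=True)

text = "This is one sentence. This is another."
print(len(pipe_split(text).sentences))     # 2: split into sentences
print(len(pipe_no_split(text).sentences))  # 1: the whole block is kept together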
def main(args): """Visualization of contexts, questions, and colored answer spans.""" # Load dataset, and optionally shuffle. dataset = QADataset(args, args.path) samples = dataset.samples if args.shuffle: random.shuffle(samples) vis_samples = samples[:args.samples] print() print('-' * RULE_LENGTH) print() # Visualize samples. for (qid, context, question, answer_start, answer_end) in vis_samples[:10]: cxt = _build_string(context) print(cxt) stanza.download('en') en_nlp = stanza.Pipeline('en') en_doc = en_nlp(cxt) for i, sent in enumerate(en_doc.sentences): print(f"[Sentence {i+1}") for word in sent.words: print("{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}".format( word.text, word.lemma, word.pos, word.head, word.deprel)) print("") print("Mention text\tType\tStart-End") for ent in en_doc.ents: print("{}\t{}\t{}-{}".format(ent.text, ent.type, ent.start_char, ent.end_char))
def init_stanza_model(lang: str = "en"): try: snlp = stanza.Pipeline(lang) except ResourcesFileNotFoundError: stanza.download(lang) snlp = stanza.Pipeline(lang) return snlp
def __init__(self, dataset_name="squad", mode="train", data_limit=-1): super(NQGDataset, self).__init__() stanza.download('en') self.nlp = stanza.Pipeline('en', processors='tokenize,pos,ner') self.datatype = QAExample if dataset_name == "squad": self.datatype = SquadExample if mode == "train": datapath = SQUAD_TRAIN elif mode == "dev": datapath = SQUAD_DEV else: raise ValueError() self.ds = read_squad_dataset(datapath, limit=data_limit) elif dataset_name == "medquad": if mode == "train": datapath = MEDQUAD_TRAIN elif mode == "dev": datapath = MEDQUAD_DEV else: raise ValueError() self.ds = read_qa_dataset(datapath, limit=data_limit) elif dataset_name == "medqa_handmade": if mode == "test": datapath = MEDQA_HANDMADE_FILEPATH else: raise ValueError() self.ds = read_qa_dataset(datapath, limit=data_limit) else: raise NotImplementedError()
def __init__(self, lang='en'):
    self.lang = lang
    if not exists_file(home_dir() + '/stanza_resources/' + lang):
        stanza.download(lang)
    self.nlp = stanza.Pipeline(lang=lang, logging_level='WARN')
    ensure_path("out/")
    ensure_path("pics/")
def stanza_extract_entities(datafile, labels=LABELS):
    """Extract named entities from an EMu XML (or CSV) file using Stanza, returning a dictionary"""
    import stanza
    try:
        nlp = stanza.Pipeline('en')
    except ValueError:
        print('English model has not been downloaded. Downloading now.')
        stanza.download('en')
        nlp = stanza.Pipeline('en')
    futures = []
    if datafile.endswith('.xml'):
        records = xml_iterator(datafile)
    else:
        records = csv_iterator(datafile)
    with ProcessPoolExecutor() as ex:
        for ident, lines in records:
            for line in lines:
                proc_line = nlp(line)
                futures.append(
                    ex.submit(stanza_return_ents, ident, proc_line, labels=labels))
    ent_data = reconcile_entities(futures)
    return ent_data
def download():
    import os
    from pathlib import Path as path
    HOME_DIR = str(path.home())

    # check nltk_data availability, download if not available
    import nltk
    nltk_rsc = os.path.join(HOME_DIR, 'nltk_data')
    for required in [os.path.join('corpora', 'stopwords.zip'),
                     os.path.join('taggers', 'averaged_perceptron_tagger.zip')]:
        if not os.path.exists(os.path.join(nltk_rsc, required)):
            print('downloading nltk: ', required[:-4])
            nltk.download(os.path.basename(required)[:-4], quiet=True)

    # check stanza_resources availability, download if not available
    import stanza
    stanza_rsc = os.path.join(HOME_DIR, 'stanza_resources/en/ner')
    for required in ['anatem.pt', 'bionlp13cg.pt', 'i2b2.pt', 'jnlpba.pt']:
        if not os.path.exists(os.path.join(stanza_rsc, required)):
            print('downloading stanza: ', required[:-3])
            stanza.download('en', package='craft',
                            processors={'ner': required[:-3]}, verbose=False)

    # check benepar_data availability, download if not available
    import benepar
    if not os.path.exists(os.path.join(nltk_rsc, 'models', 'benepar_en3')):
        print('downloading benepar: benepar_en3')
        benepar.download('benepar_en3')
def parser(sent):
    try:
        result = []
        str2 = "".join(sent)
        global str1
        global dependencyInstalled
        # Download the dependency models when running for the first time
        if not dependencyInstalled:
            stanza.download('en')
            dependencyInstalled = True
        nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse')
        count = 0
        counter = 0
        doc = nlp(str2)
        for i, sentence in enumerate(doc.sentences):
            list1 = []
            list2 = []
            list3 = []
            list4 = []
            for word in sentence.words:
                if word.deprel == "nsubj":
                    count += 1
                    if count <= 1:
                        list1.append([word.deprel, word.text, word.id,
                                      sentence.words[word.head - 1].text, word.head, word.xpos])
                        list4 = sentenceBuilder(list1[count - 1], list2, list3)
                if word.deprel != "nsubj" and word.deprel != "punct" and word.deprel != "mark":
                    list1.append([word.deprel, word.text, word.id,
                                  sentence.words[word.head - 1].text, word.head, word.xpos])
                    list4 = sentenceBuilder(list1[counter - 1], list2, list3)
            list4 = [ele for ele in list4 if ele != []]
            for x in list4:
                str1 = ' '.join(x)
                result.append(str1)
        return result
    except Exception:
        print("Nothing passed into parameters")
def main(args):
    print(args, file=sys.stderr, flush=True)
    stanza.download(args.lang, dir=args.model_dir)
    kwargs = get_stanza_kwargs(args)
    pipeline = stanza.Pipeline(**kwargs)

    def run_stanza(lines):
        if args.depparse:
            doc = pipeline([l.split() for l in lines])
            for sent in doc.sentences:
                print(' '.join(str(word.head) for word in sent.words))
        elif args.tokenize:
            doc = pipeline(lines)
            for sent in doc.sentences:
                print(' '.join(str(word.text) for word in sent.words))

    with fileinput.input(files=[args.input]) as f:
        print('| ', end='', file=sys.stderr, flush=True)
        batch = []
        for i, line in enumerate(f, start=1):
            batch.append(line.strip())
            if i % args.batch_size == 0:
                run_stanza(batch)
                print('{}...'.format(i), end='', file=sys.stderr, flush=True)
                batch = []
        if len(batch) > 0:
            print(i, file=sys.stderr)
            run_stanza(batch)
        print('| processed sentences: {}'.format(i), file=sys.stderr)
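# Hedged illustration (assumption: get_stanza_kwargs sets
# tokenize_pretokenized=True for the depparse case, since run_stanza above
# passes lists of tokens). With that option, stanza accepts a list of token
# lists directly.
import stanza

stanza.download("en", processors="tokenize,pos,lemma,depparse")
pipe = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse",
                       tokenize_pretokenized=True)
doc = pipe([["Stanza", "parses", "pretokenized", "input", "."]])
print(" ".join(str(word.head) for word in doc.sentences[0].words))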
def _download_model(self) -> None:
    """Interface with the `stanza` model downloader.

    # >>> stanza_wrapper = StanzaWrapper(language='grc', stanza_debug_level="INFO")
    # >>> stanza_wrapper._download_model()
    # True
    """
    # TODO: Add a prompt asking whether to allow stanza to download files
    # prompt the user to download the stanza models
    print("")  # pragma: no cover
    print("")  # pragma: no cover
    print("Α" * 80)  # pragma: no cover
    print("")  # pragma: no cover
    print(  # pragma: no cover
        "CLTK message: The part of the CLTK that you are using depends upon the Stanza NLP library (`stanza`). What follows are several question prompts coming from it. (More at: <https://github.com/stanfordnlp/stanza>.) Answer with defaults."
    )  # pragma: no cover
    print("")  # pragma: no cover
    print("Ω" * 80)  # pragma: no cover
    print("")  # pragma: no cover
    print("")  # pragma: no cover
    stanza.download(lang=self.language, package=self.treebank)
    # if the model file is still not available after the attempted download, raise an error
    if not file_exists(self.model_path):
        raise FileNotFoundError(
            "Missing required models for ``stanza`` at ``{0}``.".format(
                self.model_path))
def tokenize(self, lang=None, text=None, case_id=None, article_id=None, use_gpu=True):
    ## turn lang into its ISO alpha-2 code
    lang = pycountry.languages.lookup(lang).alpha_2
    ## check if the language model exists; if not, download it
    if "/home/jmr/stanza_resources/" + lang not in self.stanza_models:
        stanza.download(lang)
    ## start the stanza pipeline for tokenizing, with sentence segmentation as well as POS tagging
    nlp = stanza.Pipeline(lang=lang, processors='tokenize,mwt,pos', tokenize_no_ssplit=False)
    ## tokenize
    # mini-batch at the paragraph level by splitting the text with "\n\n", as per stanza usage;
    # significant speed improvements after some experimentation
    mini_batch = text.replace("\n", "\n\n").strip()
    doc = nlp(mini_batch)
    # for each sentence, get all tokens as well as their POS tag and other morphological info
    dict_list = []
    for i, sentence in enumerate(doc.sentences):
        for j, current_word in enumerate(sentence.words):
            cur_dict = current_word.to_dict()
            cur_dict['token_id'] = "_".join([case_id, str(i + 1), str(j + 1)])
            dict_list.append(cur_dict)
    ## turn the list of dicts into a dataframe
    df_raw = pd.DataFrame(dict_list).drop(columns="id").add_prefix("token_")
    df_raw["case_id"] = case_id
    if isinstance(article_id, str):
        df_raw['article_id'] = article_id
    return df_raw
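# Hedged illustration (assumption, not part of the original source) of the raw
# per-word dictionaries the method above flattens into a dataframe; the
# ``token_id`` column is its own convention, the remaining ``token_``-prefixed
# columns come straight from stanza's Word.to_dict().
import stanza

stanza.download("en", processors="tokenize,pos", verbose=False)
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos", verbose=False)
doc = nlp("One paragraph.\n\nAnother paragraph.")
print(doc.sentences[0].words[0].to_dict())
# expected keys include 'id', 'text', 'upos', 'xpos' and, when available, 'feats'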
def apply_stanza_processors(*corpus_names, path='/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/',
                            src_lang="de", trg_lang="en", _start=1, num=None,
                            doc_size=10000, tok_bsz=64, mwt_bsz=200, pos_bsz=10000):
    stanza.download(lang=src_lang, processors='tokenize,mwt,pos')
    stanza.download(lang=trg_lang, processors='tokenize,pos')
    src_processor = stanza.Pipeline(lang=src_lang, processors='tokenize,mwt,pos',
                                    tokenize_no_ssplit=True, tokenize_batch_size=tok_bsz,
                                    mwt_batch_size=mwt_bsz, pos_batch_size=pos_bsz)
    trg_processor = stanza.Pipeline(lang=trg_lang, processors='tokenize,pos',
                                    tokenize_no_ssplit=True, tokenize_batch_size=tok_bsz,
                                    pos_batch_size=pos_bsz)
    corpuses = read_corpuses(*corpus_names, path=path, _start=_start, num=num)

    # store the number of pieces that each corpus's stanza processor output
    # will be spread across (so we know how many pickle files to merge together
    # in downstream processing)
    num_corpus_pieces = {corpus_name: ceil(len(corpuses[corpus_name]) / doc_size)
                         for corpus_name in corpuses}
    print(f"pieces: {num_corpus_pieces}\n")
    stanza_path = path + 'stanza_outputs/'
    dump(num_corpus_pieces, open(f"{stanza_path}num_corpus_pieces.pkl", 'wb'))

    for corpus_name in corpuses:
        if is_src_corpus(corpus_name):
            apply_stanza_processor(corpus_name, corpuses[corpus_name], src_processor,
                                   path=stanza_path, doc_size=doc_size)
        else:
            apply_stanza_processor(corpus_name, corpuses[corpus_name], trg_processor,
                                   path=stanza_path, doc_size=doc_size)
    print("done.")
def stanza_pipeline(lenguaje, procesadores='tokenize, pos, lemma',
                    modelo_lemas='', modelo_ner='', modelo_pos=''):
    """
    Loads and returns a Stanza pipeline (workflow) for the given language, with the processors
    specified by the user. The processors the user can add to the pipeline include tokenization,
    Part of Speech (POS) tagging, lemmatization and Named Entity Recognition (NER), among others.
    For more information on these models and on pipelines, see the Stanza website
    (https://stanfordnlp.github.io/stanza/pipeline.html#processors).

    :param lenguaje: (str). Language for which to load the Stanza models. Stanza has models
        available for several languages, depending on the task to perform. For more information,
        visit https://stanfordnlp.github.io/stanza/available_models.html
    :param procesadores: (str). Default value: 'tokenize, pos, lemma'. List of processors, i.e.
        the processes or tasks to apply to an input text, to include in the pipeline. It is
        given as a string in which the different processors are separated by commas.
    :param modelo_lemas: (str). Default value: ''. Location of a file containing the model or
        processor the user wants to use for lemmatizing the texts. If this parameter is left
        empty, the processor available in the Stanza library for the specified language is used.
    :param modelo_ner: (str). Default value: ''. Location of a file containing the model or
        processor the user wants to use for applying Named Entity Recognition to the texts. If
        this parameter is left empty, the processor available in the Stanza library for the
        specified language is used.
    :param modelo_pos: (str). Default value: ''. Location of a file containing the model or
        processor the user wants to use for applying Part of Speech tagging to the texts. If
        this parameter is left empty, the processor available in the Stanza library for the
        specified language is used.
    :return: (stanza Pipeline). Stanza pipeline for the specified language, with the processors
        chosen by the user. If the required models are not available on the user's computer, the
        function downloads them, which can take a few minutes depending on the size of the models
        and the speed of the user's internet connection.
    """
    # Basic pipeline configuration
    config = {
        'processors': procesadores,
        'lang': lenguaje,
    }
    # If any custom model is provided, add it to the dictionary
    if modelo_pos != '':
        config['pos_model_path'] = modelo_pos
    if modelo_lemas != '':
        config['lemma_model_path'] = modelo_lemas
    if modelo_ner != '':
        config['ner_model_path'] = modelo_ner
    # Try to create the pipeline. If the model has not been downloaded, download it first.
    try:
        nlp_pipe = stanza.Pipeline(**config, verbose=0)
    except BaseException:
        print('[INFO] Downloading model. This process can take several minutes.\n')
        stanza.download(lenguaje)
        nlp_pipe = stanza.Pipeline(**config, verbose=0)
    # Return the pipeline
    return nlp_pipe
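# Usage sketch (assumption, not part of the original source): building a
# Spanish pipeline with the default processors and running it on a short text.
nlp_es = stanza_pipeline('es')
doc = nlp_es('El gato duerme en la casa.')
for sent in doc.sentences:
    for word in sent.words:
        print(word.text, word.upos, word.lemma)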
def set_language(self, lang=None):
    self.lang = lang
    if lang is None:
        return
    if not exists_file(home_dir() + '/stanza_resources/' + lang):
        stanza.download(lang)
    self.nlp = NLP(lang=lang, logging_level='ERROR')
def _download_ru_stanfordnlp_model():
    import stanza
    if not pathlib.Path.home().joinpath('stanza_resources/resources.json').exists():
        stanza.download('ru')
    else:
        logging.warning('ru language model already exists. Skipping download.')
def __init__(self, lang="en"): self.lang = lang try: nlp = stanza.Pipeline(lang=self.lang) except: stanza.download(lang) nlp = stanza.Pipeline(lang=self.lang) self.nlp = nlp
def __init__(self, context, wordService):
    self.__wordService = wordService
    self.__context = context
    stanza.download('en')
    stanza.download('pl')
    self.stanza_en = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
    self.stanza_pl = stanza.Pipeline(lang='pl', processors='tokenize,mwt,pos,lemma')
def test_spacy_stanza_german():
    lang = "de"
    stanza.download(lang)
    nlp = spacy_stanza.load_pipeline(lang)
    assert nlp.Defaults == GermanDefaults

    # warning for misaligned ents due to multi-word token expansion
    with pytest.warns(UserWarning):
        doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten")
def test_spacy_stanza_tokenizer_options():
    # whitespace tokens from the spacy tokenizer are handled correctly
    lang = "en"
    stanza.download(lang)
    nlp = spacy_stanza.load_pipeline(lang, processors={"tokenize": "spacy"})
    doc = nlp(" Barack  Obama  was  born\n\nin Hawaii.")
    assert [t.text for t in doc] == [
        " ",
        "Barack",
        " ",
        "Obama",
        " ",
        "was",
        " ",
        "born",
        "\n\n",
        "in",
        "Hawaii",
        ".",
    ]

    # pretokenized text is handled correctly
    nlp = spacy_stanza.load_pipeline(lang, tokenize_pretokenized=True)
    doc = nlp("Barack Obama was born in Hawaii.\nBarack Obama was born in Hawaii.")
    assert [t.text for t in doc] == [
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
    ]
    doc = nlp(
        " Barack Obama was born\n\n in Hawaii.\nBarack Obama was born in Hawaii."
    )
    assert [t.text for t in doc] == [
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
        "Barack",
        "Obama",
        "was",
        "born",
        "in",
        "Hawaii.",
    ]
def main(args=None):
    torch.multiprocessing.set_start_method('fork')
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    stanza.download('en')
    run_processor(StanzaSelectiveParser(), mp=True,
                  mp_context=torch.multiprocessing, args=args)
def __init__(self):
    print('Init NLPUtils')
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    stanza.download('es')
    self.nlp = stanza.Pipeline('es')
    self.pattern = re.compile(r'(\#\w+)')
    self.tweet_tokenizer = TweetTokenizer()
def update_config_value(self, name, old_value, new_value):
    if old_value == 'install_resource':
        if new_value == 'y':
            stanza.download('fr')
        else:
            raise Exception(
                'you can not run this package without the resources')
    else:
        self.config['package'] = new_value
def __init__(self):
    super(StanzaAnalyzer, self).__init__()
    stanza.download("pl")
    self._nlp_pipeline = stanza.Pipeline(
        "pl", processors="tokenize,pos,lemma",
        verbose=True, use_gpu=True)  # initialize neural pipeline
    self._conv_stanza_pos = lambda x: [w.pos for w in x.words]
def parse_data(data_file=config.DEV_MA_FILE, target=config.PDEV_MA_FILE,
               function_test=False, force_exe=False):
    """
    input: (data = str, embedding = str, target file = str)
    effect: preprocess and save data to the target file
    output: preprocessed data

    parsed data is in jsonl (each line is a json):
    {
        config.idf : id (as a string),
        config.hf  : Stanza Doc,
        config.pf  : Stanza Doc,
        config.lf  : int
    }
    """
    # aliases
    p = config.pf
    h = config.hf
    l = config.lf
    # stanza init
    stanza.download('en')
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
    # load the data file
    with open(data_file) as fo:
        raw_lines = fo.readlines()
    json_data = [json.loads(line) for line in raw_lines]
    if function_test:
        json_data = json_data[:10]
    if os.path.isfile(str(target)) and not force_exe:
        print("file " + str(target) + " already exists")
        print("if you still want to proceed, add force_exe=True to the function arguments")
        print("exiting")
        return None
    else:
        print("creating file " + str(target) + " to save the result")
        print("executing")
    # dependency parsing and jsonl saving
    with jsonl.open(target, mode='w') as writer:
        parsed_data = []
        for data in tqdm(json_data):
            # only add examples that have gold labels
            if data[l] not in config.label_to_id.keys():
                continue
            pdata = process_one_example(data, nlp)
            parsed_data.append(pdata)
            writer.write(pdata)
    return parsed_data