def test_backpointer():
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC2)
    ent = doc.ents[0]
    assert ent.sent is doc.sentences[0]
    assert list(doc.iter_words())[0].sent is doc.sentences[0]
    assert list(doc.iter_tokens())[-1].sent is doc.sentences[-1]
def __init__(self, port=9001):
    self.nlp = stanza.Pipeline('en')  # initialize English neural pipeline
    self.client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'parse'],
        timeout=60000,
        memory='4G',
        endpoint=f'http://localhost:{port}')
def test_readonly():
    Document.add_property('some_property', 123)
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC)
    assert doc.some_property == 123
    with pytest.raises(ValueError):
        doc.some_property = 456
def load_conll_data(file_path):
    instances = []
    words = []
    labels = []
    with open(file_path, 'r') as fp:
        for line in fp:
            line = line.strip()
            if len(line) == 0:
                # blank line marks the end of a sentence
                if len(words) != 0:
                    instance = dict(words=words, labels=labels)
                    instances.append(instance)
                    words = []
                    labels = []
            else:
                columns = line.split()
                words.append(columns[0])
                labels.append(columns[-1])
    if len(words) != 0:
        instance = dict(words=words, labels=labels)
        instances.append(instance)
    nlp = stanza.Pipeline(lang='en', tokenize_pretokenized=True)
    for instance in tqdm(instances):
        doc = nlp([instance['words']])
        sentence = doc.sentences[0]
        dep_head = [''] * len(instance['words'])
        for i, word in enumerate(sentence.words):
            dep_head[i] = word.head - 1
        instance['heads'] = dep_head
    return instances
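# A minimal usage sketch for load_conll_data above, not part of the original
# snippet: the file path and the printed fields are hypothetical and only
# illustrate the expected CoNLL-style layout (token in the first column,
# label in the last), with dependency heads added by the stanza parser.
if __name__ == '__main__':
    parsed = load_conll_data('train.conll')  # hypothetical path
    first = parsed[0]
    print(first['words'][:5], first['labels'][:5], first['heads'][:5])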
def test_tokenize():
    nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC)
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
def test_no_ssplit():
    nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR,
                             'lang': 'en', 'tokenize_no_ssplit': True})
    doc = nlp(EN_DOC_NO_SSPLIT)
    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
    assert all([doc.text[token._start_char: token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])
def test_jieba():
    nlp = stanza.Pipeline(lang='zh', dir=TEST_MODELS_DIR,
                          processors={'tokenize': 'jieba'}, package=None)
    doc = nlp(ZH_DOC)
    assert "JiebaTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert ZH_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])
def __load_pipeline(self):
    Globals = globals()
    Globals_keys = set(Globals.keys())
    pipeline_vars = set(['s_nlp', 's_nlp_lang', 's_nlp_path'])
    is_loaded = pipeline_vars.issubset(Globals_keys) and Globals['s_nlp'] is not None
    if is_loaded:
        same_processors = set(Globals['s_nlp'].processors.keys()) == set(['tokenize', 'depparse', 'pos', 'lemma'])
        same_gpu_use = not Globals['s_nlp'].use_gpu
        same_lang = Globals['s_nlp_lang'] == self.lang
        same_path = Globals['s_nlp_path'] == self.stanza_path
        same_setup = same_lang and same_path and same_processors and same_gpu_use
    else:
        same_setup = False
    if not is_loaded or not same_setup:
        if self.__globalize_stanza:
            global s_nlp
            global s_nlp_lang
            global s_nlp_path
        s_nlp_lang = self.lang
        s_nlp_path = self.stanza_path
        s_nlp = stanza.Pipeline(
            lang=s_nlp_lang,
            dir=s_nlp_path,
            processors="tokenize,lemma,pos,depparse")
    return s_nlp
def test_sudachipy_no_ssplit():
    nlp = stanza.Pipeline(lang='ja', dir=TEST_MODELS_DIR,
                          processors={'tokenize': 'sudachipy'},
                          tokenize_no_ssplit=True, package=None)
    doc = nlp(JA_DOC)
    assert "SudachiPyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert JA_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])
def _sent_split_corpus(self, arr_input_text):
    """Sentence-split the corpus with the tokenizer given by the config file."""
    # arr_input_text = pd_input['essay'].values
    # num_over = 0
    # total_sent = 0
    import stanza  # Stanford library used for tokenization
    tokenizer_stanza = stanza.Pipeline('en', processors='tokenize', use_gpu=True)

    num_sents = []
    sent_corpus = []  # tokenized into the form [doc, list of sentences]
    for cur_doc in arr_input_text:
        cur_doc = self._refine_text(cur_doc)  # cur_doc: single string
        # spacy style:
        # sent_list = [sent.string.strip() for sent in spacy_nlp(cur_doc).sents]
        ## stanza version
        doc_stanza = tokenizer_stanza(cur_doc)
        sent_list = [sentence.text for sentence in doc_stanza.sentences]
        ## plain version (exactly the same as in previous works):
        # sent_list = self.sent_tokenzier(cur_doc)
        sent_corpus.append(sent_list)
        num_sents.append(len(sent_list))

    return sent_corpus, num_sents
def main(args):
    """Visualization of contexts, questions, and colored answer spans."""
    # Load dataset, and optionally shuffle.
    dataset = QADataset(args, args.path)
    samples = dataset.samples
    if args.shuffle:
        random.shuffle(samples)
    vis_samples = samples[:args.samples]

    print()
    print('-' * RULE_LENGTH)
    print()

    # Download the English models and build the pipeline once, outside the loop.
    stanza.download('en')
    en_nlp = stanza.Pipeline('en')

    # Visualize samples.
    for (qid, context, question, answer_start, answer_end) in vis_samples[:10]:
        cxt = _build_string(context)
        print(cxt)
        en_doc = en_nlp(cxt)
        for i, sent in enumerate(en_doc.sentences):
            print(f"[Sentence {i+1}]")
            for word in sent.words:
                print("{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}".format(
                    word.text, word.lemma, word.pos, word.head, word.deprel))
            print("")
        print("Mention text\tType\tStart-End")
        for ent in en_doc.ents:
            print("{}\t{}\t{}-{}".format(ent.text, ent.type, ent.start_char, ent.end_char))
def stanza_gen(texts, lang,
               processors="tokenize,mwt,lemma,pos,depparse,ner",
               stanza_path=os.path.join(str(Path.home()), 'stanza_resources'),
               verbose=True, **kwargs):
    """
    texts (iter): an iterator of strings
    lang (str): language code
    stanza_path (path): the path for saving stanza resources

    Examples:
    >>> sg = stanza_gen(texts=["dette er en test text"], lang="da", verbose=False)
    >>> type(sg)
    <class 'generator'>
    >>> sg_unpacked = list(sg)
    >>> type(sg_unpacked[0])
    <class 'pandas.core.frame.DataFrame'>
    """
    s_nlp = stanza.Pipeline(lang=lang, processors=processors,
                            dir=stanza_path, verbose=verbose, **kwargs)

    for text in texts:
        doc = s_nlp(text)

        sent_ids = dict()
        sent_n = None

        def __get_ent(n_sent, sent, word):
            # cache the word id -> entity type mapping per sentence
            nonlocal sent_ids
            nonlocal sent_n
            if sent_n != n_sent:
                sent_ids = {
                    word.id: ent.type
                    for ent in sent.ents
                    for word in ent.words
                }
                sent_n = n_sent  # remember which sentence the cache belongs to
            if word.id in sent_ids:
                return sent_ids[word.id]

        # extract from doc
        extr = (
            (
                n_sent,  # sentence number
                word.text, word.lemma, word.upos, word.xpos,  # pos-tags
                word.deprel,
                __get_ent(n_sent, sent, word))
            for n_sent, sent in enumerate(doc.sentences)
            for word in sent.words)
        cols = [
            "n_sent", "token", "lemma", "upos", "xpos",
            "dependency relation", "ner"
        ]
        yield pd.DataFrame(extr, columns=cols)
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator.
        num_hypotheses (int): The number of hypotheses to generate.

    Returns:
        [Hypothesis]: A list of Hypotheses for translating input_text.
    """
    info('apply_packaged_translation')

    # Sentence boundary detection with a tokenize-only Stanza pipeline
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                      dir=str(pkg.package_path / 'stanza'),
                                      processors='tokenize',
                                      use_gpu=False,
                                      logging_level='WARNING')
    stanza_sbd = stanza_pipeline(input_text)
    sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)

    # Subword tokenization with SentencePiece
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)

    # Translation
    BATCH_SIZE = 32
    assert len(sentences) <= BATCH_SIZE
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2)
    info('translated_batches', translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
            cumulative_score += translated_batch[i]['score']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        value = detokenized
        if len(value) > 0 and value[0] == ' ':
            # Remove the space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info('value_hypotheses', value_hypotheses)
    return value_hypotheses
def test_missing_requirements():
    """
    Try to build several pipelines with bad configs and check thrown exceptions against gold exceptions.
    :return: None
    """
    # list of (bad config, list of gold ProcessorRequirementsExceptions that should be thrown) pairs
    bad_config_lists = [
        # missing tokenize
        (
            # input config
            {
                'processors': 'pos,depparse',
                'dir': TEST_MODELS_DIR,
                'lang': 'en'
            },
            # 2 expected exceptions
            [{
                'processor_type': 'POSProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize'])
            }, {
                'processor_type': 'DepparseProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize', 'pos', 'lemma'])
            }]),
        # no pos when lemma_pos set to True; for English, mwt should not be included in the loaded processor list
        (
            # input config
            {
                'processors': 'tokenize,mwt,lemma',
                'dir': TEST_MODELS_DIR,
                'lang': 'en',
                'lemma_pos': True
            },
            # 1 expected exception
            [{
                'processor_type': 'LemmaProcessor',
                'processors_list': ['tokenize', 'lemma'],
                'provided_reqs': set(['tokenize', 'mwt']),
                'requires': set(['tokenize', 'pos'])
            }])
    ]

    # try to build each bad config, catch exceptions, check against gold
    pipeline_fails = 0
    for bad_config, gold_exceptions in bad_config_lists:
        try:
            stanza.Pipeline(**bad_config)
        except PipelineRequirementsException as e:
            pipeline_fails += 1
            assert isinstance(e, PipelineRequirementsException)
            assert len(e.processor_req_fails) == len(gold_exceptions)
            for processor_req_e, gold_exception in zip(e.processor_req_fails, gold_exceptions):
                # compare the thrown ProcessorRequirementsExceptions against gold
                check_exception_vals(processor_req_e, gold_exception)

    # check pipeline building failed twice
    assert pipeline_fails == 2
def __load_stanza_pipeline(self, model_folder: str, use_gpu: bool):
    logging.debug("Starting to load the Stanza models into the Pipeline!")
    self.stanza_pipeline = stanza.Pipeline(lang='et',
                                           processors='tokenize,pos,lemma',
                                           dir=model_folder,
                                           use_gpu=use_gpu)
    logging.debug("Finished loading the Stanza models!")
def __initialize():
    global NLP
    APP_DIR = os.environ.get('APP_DIR')
    try:
        NLP = stanza.Pipeline('es', verbose=False, use_gpu=False,
                              dir=f'{APP_DIR}stanfordnlp_resources')
    except (ResourcesFileNotFoundError, FileNotFoundError) as ex:
        # "Downloading the 'es' model"
        logging.info("Stanza: Descargando modelo 'es'")
        stanza.download('es', verbose=False,
                        model_dir=f'{APP_DIR}stanfordnlp_resources')
        NLP = stanza.Pipeline('es', verbose=False, use_gpu=False,
                              dir=f'{APP_DIR}stanfordnlp_resources')
def wrapper(self) -> list:
    nlp = stanza.Pipeline(
        lang=self._lang,
        processors=", ".join(self._processors),
        use_gpu=False,
    )
    object_stanza = nlp(" ".join(self._document))
    # Convert the stanza Document object into a JSON object.
    return json.loads(str(object_stanza))
def test_register_processor_variant():
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en',
                          processors={"tokenize": "lol"}, package=None)
    doc = nlp(EN_DOC)
    assert EN_DOC_LOL_TOKENS == '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
def initStanzaPipeline(lang):
    downloadStanza(lang)
    global snlpInitialized
    global nlpStanza
    if not snlpInitialized:
        snlp = stanza.Pipeline(lang=lang)
        nlpStanza['snlp'] = StanzaLanguage(snlp)
        snlpInitialized = True
def stanza_pos_fct(sent_tok: list):
    # uses batches
    nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,pos',
                                 tokenize_pretokenized=True)
    pos_batch = [[(word.text, word.xpos) for word in s.words]
                 for s in nlp_stanza(sent_tok).sentences]
    return [item for sublist in pos_batch for item in sublist]
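# A small, hedged example of calling stanza_pos_fct with pretokenized input;
# the sentences below are invented. Note that the pipeline is rebuilt on every
# call, so for large corpora it may be worth constructing it once outside.
example_sentences = [["I", "like", "apples"], ["She", "reads", "books"]]
# expected result: a flat list of (token, xpos) pairs, e.g. ("I", "PRP")
print(stanza_pos_fct(example_sentences))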
def main():
    # nlp = stanza.Pipeline('en',
    #                       processors='tokenize,pos,lemma,ner')
    nlp = stanza.Pipeline('en', processors='tokenize')
    doc = nlp('Uro ruined modern. Fortunately, Wotc banned him')
    print(process_doc(doc, "him", "ruined"))
def __init__(self):
    self.nlp = stanza.Pipeline("id", use_gpu=False)
    self.stemmer = StemmerFactory().create_stemmer()
    self.ner = get_entities

    # Set POS Tagger
    self.pos_tagger = nltk.tag.CRFTagger()
    self.pos_tagger.set_model_file(
        'pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')
def __init__(self):
    super().__init__()
    # stanza.download('ja')
    self.nlp = stanza.Pipeline('ja')
    self.jmd = Jamdict()
    self._translate_jmd_cache = {}
def __init__(self, file, soup, test_csv):
    self.soup = soup
    self.document = test_csv
    self.s = re.split(r"_|\.", file)
    self.id = self.s[3]
    self.abstract = []
    self.nlp = stanza.Pipeline(lang='en', processors='tokenize')
    self.claims_start = 1500
def main():
    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')
    doc = nlp('Unban Mox Opal! Unban Mox Opal!')
    # print(doc.sentences[0].dependencies)
    print(doc)
    print(process_doc(doc, "{}=source >obj=zzz {}=target"))
def nlp(doc):
    """Processes a text with spacy and stanza."""
    snlp = stanza.Pipeline(lang="la")
    NLP = StanzaLanguage(snlp)
    return NLP(doc)
def _load_stanza(
    stanza_setup: Dict[str, str] = {
        "lang": "en",
        "package": "genia",
        "processors": {"ner": "bionlp13cg"},
    },
    use_gpu: bool = True,
) -> stanza.Pipeline:
    # TODO: [RICO -> put use_gpu inside one config]
    print("loading stanza", stanza_setup)
    try:
        snlp = stanza.Pipeline(**stanza_setup, use_gpu=use_gpu)
    except:
        # models are not available locally yet; download them and retry
        stanza.download(**stanza_setup)
        snlp = stanza.Pipeline(**stanza_setup, use_gpu=use_gpu)
    return snlp
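# Hedged usage sketch for _load_stanza: with the default setup it loads the
# English biomedical package ("genia" tokenization/POS plus the "bionlp13cg"
# NER model), which must be downloadable; the sample sentence is invented.
bio_nlp = _load_stanza(use_gpu=False)
bio_doc = bio_nlp("BRCA1 mutations increase the risk of breast cancer.")
for ent in bio_doc.ents:
    print(ent.text, ent.type)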
def test_zh_tokenizer_parens():
    """
    The original fix for newlines in Chinese text broke () in Chinese text
    """
    nlp = stanza.Pipeline(lang='zh', processors="tokenize", dir=TEST_MODELS_DIR)
    doc = nlp(ZH_PARENS_DOC)
def test_spacy_stanza_tokenizer_options():
    lang = "en"
    stanza.download(lang)

    snlp = stanza.Pipeline('en', processors={'tokenize': 'spacy'})
    nlp = StanzaLanguage(snlp)
    # whitespace tokens from spacy tokenizer are handled correctly
    doc = nlp(" Barack Obama was born\n\nin Hawaii.")

    snlp = stanza.Pipeline('en', tokenize_pretokenized=True)
    nlp = StanzaLanguage(snlp)
    # pretokenized text is handled correctly (possibly with warnings because
    # the character offsets from stanza 1.0.0 are incorrect)
    doc = nlp(
        "Barack Obama was born in Hawaii.\nBarack Obama was born in Hawaii.")
    doc = nlp(
        " Barack Obama was born\n\n in Hawaii.\nBarack Obama was born in Hawaii."
    )
def set_language(self, lang=None):
    self.lang = lang
    if lang is None:
        return
    if not exists_file(home_dir() + '/stanza_resources/' + lang):
        stanza.download(lang)
    self.nlp = stanza.Pipeline(lang=lang, logging_level='ERROR')