def preprocessing(language: str):
    from benepar.spacy_plugin import BeneparComponent
    import zh_core_web_trf
    import en_core_web_trf

    global ucb_parser
    if language == 'zh':
        nlp = zh_core_web_trf.load()
        ucb_parser = BeneparComponent('benepar_zh')
    elif language == 'en':
        nlp = en_core_web_trf.load()
        ucb_parser = BeneparComponent('benepar_en2')
    else:
        print('language error')
        exit(-1)

    # Drop components we do not need for constituency parsing.
    nlp.disable_pipes('tagger', 'parser', 'attribute_ruler')
    if language == 'en':
        nlp.disable_pipe('lemmatizer')

    # Add the benepar constituency parser as the last pipeline component.
    # Note: BeneparComponent follows the spaCy-2-style plugin API (benepar < 0.2);
    # with benepar >= 0.2 on spaCy 3 the parser is registered instead via
    # nlp.add_pipe('benepar', config={'model': ...}).
    nlp.add_pipe(ucb_parser, name='cp_parser', last=True)
    return nlp
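# Usage sketch (not part of the original snippet): assumes the 'benepar_en2'
# model has already been downloaded, e.g. via benepar.download('benepar_en2').
if __name__ == '__main__':
    nlp_en = preprocessing('en')
    doc = nlp_en('The time for action is now.')
    for sent in doc.sents:
        # benepar exposes the bracketed constituency parse per sentence
        print(sent._.parse_string)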
# Evaluation
from rouge_score import rouge_scorer

# SRL
# _jsonnet was really slowing down the process.
# pip install jsonnetbin fixed this and cleared the error;
# performance is now back to normal speed.
from allennlp.predictors.predictor import Predictor

# Split text into sentences
from nltk import tokenize

# Co-reference resolution
# import neuralcoref  # neuralcoref is not compatible with spacy > 3.0.0

# spaCy
import time

import en_core_web_trf

nlp = en_core_web_trf.load()

print("\nAllenNLP loading predictors...")
start = time.time()

from T2S.src.utils.coref_utils import coref_with_lemma

# SRL
predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
)

end = time.time()
print(f"Computation time - {round(end - start, 2)} seconds")

DECIMAL_FIGURES = 3


def verb_sem_eval(hyp_lemma, ref_lemma):
    ...  # body truncated in this excerpt
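# Usage sketch (not part of the original snippet): the AllenNLP SRL predictor
# returns a dict with the tokenized 'words' and one entry per detected verb;
# the sample sentence is illustrative only.
srl_output = predictor.predict(sentence="The keeper fed the lions at the zoo.")
for verb in srl_output["verbs"]:
    # 'description' renders the BIO tags as bracketed argument spans
    print(verb["verb"], "->", verb["description"])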
def precalculate_spacy_english_lemmatizer(cls, datasets):
    # We only need the lemmatizer component, so disable the others.
    cls._precalculate_spacy_lemmatizer(
        en_core_web_trf.load(disable=['ner', 'parser']),
        datasets,
        PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV
    )
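# Hypothetical sketch (assumption; _precalculate_spacy_lemmatizer is not shown
# in this excerpt): the likely idea is to run the lemmatizer once over every
# dataset text and cache token -> lemma pairs in a CSV, so later runs can skip
# the expensive transformer pipeline. All names below are illustrative.
import csv

def _precalculate_spacy_lemmatizer_sketch(nlp, datasets, csv_path):
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['token', 'lemma'])
        for text in datasets:
            for token in nlp(text):
                writer.writerow([token.text, token.lemma_])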
import en_core_web_trf


def recognize_name_entity(summary):
    # Loading the transformer model on every call is expensive; if this is
    # called repeatedly, load the model once at module level instead.
    nlp = en_core_web_trf.load()
    doc = nlp(summary)
    result = [(X.text, X.label_) for X in doc.ents]
    return result
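# Usage sketch (the sentence and labels are the classic spaCy docs example):
print(recognize_name_entity("Apple is looking at buying U.K. startup for $1 billion"))
# e.g. [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]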