def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
    """Build a spaCy-compatible pipeline that ends with a ``ConllFormatter``.

    :param model_or_lang: model name or language code. When ``None`` it falls
        back to ``'en'`` (stanfordnlp) or ``'en_core_web_sm'`` (spaCy).
    :param is_tokenized: whether the input text is already tokenized. On the
        stanfordnlp path it is forwarded as ``tokenize_pretokenized``; on the
        spaCy path the tokenizer is swapped for its ``tokens_from_list``.
    :param disable_sbd: add the ``prevent-sbd`` component before the parser.
        Only honoured on the spaCy path.
    :param use_stanfordnlp: wrap a stanfordnlp pipeline instead of loading a
        spaCy model.
    :return: the pipeline, with the CoNLL formatter added as last component
    """
    if model_or_lang is None:
        if use_stanfordnlp:
            model_or_lang = 'en'
        else:
            model_or_lang = 'en_core_web_sm'

    if not use_stanfordnlp:
        # Plain spaCy model, customised according to the two flags.
        nlp = spacy.load(model_or_lang)
        if is_tokenized:
            nlp.tokenizer = nlp.tokenizer.tokens_from_list
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')
    else:
        # Imported lazily so the stanfordnlp packages are only required
        # when this path is actually requested.
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        stanford_pipeline = stanfordnlp.Pipeline(
            lang=model_or_lang,
            tokenize_pretokenized=is_tokenized,
        )
        nlp = StanfordNLPLanguage(stanford_pipeline)

    # The formatter must run last, after all other components.
    nlp.add_pipe(ConllFormatter(nlp), last=True)
    return nlp
class RuleBasedPreprocessor(PreprocessorBase):
    """Preprocessor that performs the whole rule-based conversion itself.

    A question/option pair is parsed with a stanfordnlp pipeline (wrapped for
    spaCy and extended with a ``ConllFormatter``), and the option is inserted
    into the question to produce a single hypothesis string.
    """

    def __init__(self) -> None:
        """Create the parsing pipeline, the detokenizer and the fallback."""
        stanford_pipeline = stanfordnlp.Pipeline(lang='en')
        # spaCy wrapper around the stanfordnlp pipeline
        self.nlp = StanfordNLPLanguage(stanford_pipeline)
        self.nlp.add_pipe(ConllFormatter(self.nlp), last=True)
        self.detokenizer = MosesDetokenizer()
        # Plain base preprocessor, used for fill-in-the-blank questions.
        self.vanila_preprocessor = PreprocessorBase()

    def __call__(self, q: str, o: str) -> Tuple[str, Dict]:
        """Convert question ``q`` and option ``o`` into one hypothesis.

        :param q: the question text
        :param o: the answer option text
        :return: ``(hypothesis, meta)``; ``meta`` always carries
            ``'question'`` and ``'option'``, and ``'conversion_issues'``
            whenever the rule-based path (or its parse fallback) was taken
        """
        # An underscore marks a fill-in-the-blank question: delegate to the
        # plain preprocessor and return early.
        if '_' in q:
            hypothesis, fitb_meta = self.vanila_preprocessor(q, o)
            return hypothesis, fitb_meta

        # Option already contained in the question: most likely an already
        # preprocessed fill-in-the-blank question, so pass it through.
        if o in q:
            return q, {'question': q, 'option': o}

        # The parsing code emits UserWarnings; silence them for these calls.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            q_doc = self.nlp(q)
            o_doc = self.nlp(o)

        try:
            q_tokens = parse(q_doc._.conll_str)[0].tokens
            o_tokens = parse(o_doc._.conll_str)[0].tokens
        except IndexError:
            # No parsed sentence available: fall back to concatenation.
            logger.error(f"Index error on parse for {q}")
            fallback_meta: Dict[str, Any] = {
                'question': q,
                'option': o,
                'conversion_issues': [str(ConversionIssue.UNKNOWN)]
            }
            return q + ' ' + o, fallback_meta

        rule_q = Question(deepcopy(q_tokens))  # type:ignore
        rule_o = AnswerSpan(deepcopy(o_tokens))  # type:ignore

        meta: Dict[str, Any] = {'question': q, 'option': o}
        conversion_issues = []
        if not rule_q.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_QUESTION)
        if not rule_o.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_OPTION)

        if not conversion_issues:
            rule_q.insert_answer_default(rule_o)
            h = self.detokenizer.detokenize(rule_q.format_declr(),
                                            return_str=True)
        else:
            # Any conversion issue: just concatenate question and option.
            h = q + ' ' + o

        issue_names = [str(issue) for issue in conversion_issues]
        meta['conversion_issues'] = issue_names
        if issue_names:
            logger.debug(
                f"Issues {conversion_issues} encountered for {q} + {o}")

        return h, meta
def spacy_stanfordnlp_en_with_formatter():
    """Return an English stanfordnlp pipeline wrapped for spaCy.

    The returned object has a ``ConllFormatter`` added as its last pipeline
    component.
    """
    english_pipeline = stanfordnlp.Pipeline(lang='en')
    wrapped = StanfordNLPLanguage(english_pipeline)
    wrapped.add_pipe(ConllFormatter(wrapped), last=True)
    return wrapped
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spaCy-wrapped parser for a given language or model.

    :param parser: which parser to use. Valid options are 'spacy',
           'stanfordnlp', 'stanza' and 'udpipe'. Parsers other than 'spacy'
           must be installed separately, together with their spacy-* wrapper
           (e.g. spacy-stanza). Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed).
           Defaults to an English model
    :param is_tokenized: indicates whether your text has already been
           tokenized (space-separated). For stanza and stanfordnlp, this will
           also cause sentence segmentation *only* to be done by splitting on
           new lines. See the documentation:
           https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection
           (only works for spaCy)
    :param parser_opts: options passed to the core pipeline. For spacy and
           udpipe they are passed to their `.load()` initialisations, for
           stanfordnlp and stanza to their `.Pipeline()` initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    :raises ValueError: if ``parser`` is not one of the valid options
    """
    if parser not in ("spacy", "stanfordnlp", "stanza", "udpipe"):
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    opts = parser_opts if parser_opts is not None else {}

    # Optional back-ends are imported lazily, so only the requested one
    # needs to be installed.
    if parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **opts)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        stanza_pipeline = stanza.Pipeline(
            lang=model_or_lang, tokenize_pretokenized=is_tokenized, **opts
        )
        nlp = StanzaLanguage(stanza_pipeline)
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        stanford_pipeline = stanfordnlp.Pipeline(
            lang=model_or_lang, tokenize_pretokenized=is_tokenized, **opts
        )
        nlp = StanfordNLPLanguage(stanford_pipeline)
    else:
        # parser == "spacy"
        nlp = spacy.load(model_or_lang, **opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")

    # The formatter runs last so it sees the output of every other component.
    nlp.add_pipe(ConllFormatter(nlp, **kwargs), last=True)
    return nlp
from typing import * import json # In[2]: # stanfordnlp.download('en') # Config input_path = Path('train_has_following.json') output_path = Path('.data/RACE/train_has_following_reconverted.json') # In[3]: snlp = stanfordnlp.Pipeline(lang='en') nlp = StanfordNLPLanguage(snlp) conllformatter = ConllFormatter(nlp) nlp.add_pipe(conllformatter, last=True) detokenizer = MosesDetokenizer() # In[4]: # load data with open(input_path) as f: samples = json.load(f) # In[5]: warnings.filterwarnings("ignore") def convert2(qa_sample: Dict): q_doc = nlp(qa_sample['question'])