示例#1
0
def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
    """Create a pipeline whose last component is a ``ConllFormatter``.

    :param model_or_lang: model name (spaCy) or language code (stanfordnlp).
           ``None`` selects 'en' for stanfordnlp and 'en_core_web_sm' for spaCy
    :param is_tokenized: on the spaCy path the tokenizer is replaced by its
           ``tokens_from_list``; on the stanfordnlp path the flag is forwarded
           as ``tokenize_pretokenized``
    :param disable_sbd: on the spaCy path, inserts ``_prevent_sbd`` before the
           parser. Not consulted on the stanfordnlp path
    :param use_stanfordnlp: build a spaCy-wrapped stanfordnlp pipeline instead
           of loading a spaCy model
    :return: the pipeline with the formatter appended
    """
    if model_or_lang is None:
        if use_stanfordnlp:
            model_or_lang = 'en'
        else:
            model_or_lang = 'en_core_web_sm'

    if not use_stanfordnlp:
        # Plain spaCy model, customised according to the two flags.
        nlp = spacy.load(model_or_lang)
        if is_tokenized:
            nlp.tokenizer = nlp.tokenizer.tokens_from_list
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')
    else:
        # Imported lazily so the stanfordnlp packages are only required
        # when this path is actually taken.
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        nlp = StanfordNLPLanguage(
            stanfordnlp.Pipeline(lang=model_or_lang,
                                 tokenize_pretokenized=is_tokenized))

    # The formatter always runs after every other component.
    nlp.add_pipe(ConllFormatter(nlp), last=True)

    return nlp
示例#2
0
class RuleBasedPreprocessor(PreprocessorBase):
    """Rule-based conversion of a question/option pair into one string.

    With rule-based conversion the whole transformation is carried out
    inside the preprocessor.
    """

    def __init__(self) -> None:
        # stanfordnlp python pipeline, wrapped so it is usable like a spaCy
        # pipeline
        self.nlp = StanfordNLPLanguage(stanfordnlp.Pipeline(lang='en'))
        # Formatter runs last; presumably it is what populates the
        # `_.conll_str` extension read in __call__ -- confirm against
        # ConllFormatter.
        self.nlp.add_pipe(ConllFormatter(self.nlp), last=True)
        self.detokenizer = MosesDetokenizer()
        self.vanila_preprocessor = PreprocessorBase()

    def __call__(self, q: str, o: str) -> Tuple[str, Dict]:
        """Convert question ``q`` and option ``o`` into a hypothesis string.

        Returns a ``(hypothesis, meta)`` pair. Except on the delegated
        fill-in-the-blank path (whose meta comes from the wrapped
        preprocessor), ``meta`` holds 'question' and 'option'; it also holds
        'conversion_issues' whenever parsing was attempted.
        """
        # Fill-in-the-blank question: delegate and return early.
        if '_' in q:
            hypothesis, info = self.vanila_preprocessor(q, o)
            return hypothesis, info

        # Option already contained in the question: most likely an already
        # preprocessed fill-in-the-blank question, so pass it through.
        if o in q:
            return q, {'question': q, 'option': o}

        # The old code throws UserWarnings; silence them while parsing.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            question_doc = self.nlp(q)
            option_doc = self.nlp(o)

        try:
            question_tokens = parse(question_doc._.conll_str)[0].tokens
            option_tokens = parse(option_doc._.conll_str)[0].tokens
        except IndexError:
            # No parsed sentence available: fall back to plain concatenation.
            logger.error(f"Index error on parse for {q}")
            return q + ' ' + o, {
                'question': q,
                'option': o,
                'conversion_issues': [str(ConversionIssue.UNKNOWN)],
            }

        rule_q = Question(deepcopy(question_tokens))  # type:ignore
        rule_o = AnswerSpan(deepcopy(option_tokens))  # type:ignore

        conversion_issues = []
        if not rule_q.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_QUESTION)
        if not rule_o.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_OPTION)

        if not conversion_issues:
            rule_q.insert_answer_default(rule_o)
            hypothesis = self.detokenizer.detokenize(rule_q.format_declr(),
                                                     return_str=True)
        else:
            # Any conversion issue means we just concatenate q and o.
            hypothesis = q + ' ' + o

        info = {
            'question': q,
            'option': o,
            'conversion_issues': [str(issue) for issue in conversion_issues],
        }

        if conversion_issues:
            logger.debug(
                f"Issues {conversion_issues} encountered for {q} + {o}")

        return hypothesis, info
示例#3
0
def spacy_stanfordnlp_en_with_formatter():
    """Return an English stanfordnlp pipeline, wrapped for spaCy, with a
    ``ConllFormatter`` appended as its last component."""
    nlp = StanfordNLPLanguage(stanfordnlp.Pipeline(lang='en'))
    nlp.add_pipe(ConllFormatter(nlp), last=True)
    return nlp
示例#4
0
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spaCy-wrapped parser and append a ConllFormatter to it.

    :param parser: which backend to use: 'spacy', 'stanfordnlp', 'stanza' or
           'udpipe'. Backends other than 'spacy' are imported on demand, so
           the matching spacy-* wrapper package (e.g. spacy-stanza) must be
           installed. Defaults to 'spacy'
    :param model_or_lang: model name or language code handed to the backend
           (must be installed). Defaults to 'en'
    :param is_tokenized: indicates that the text is already tokenized
           (space-separated). For spaCy the tokenizer is swapped for a
           ``SpacyPretokenizedTokenizer``; for stanfordnlp and stanza the flag
           is forwarded as ``tokenize_pretokenized``, which also makes
           sentence segmentation happen *only* by splitting on new lines.
           See https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           and https://stanfordnlp.github.io/stanza/tokenize.html
           Not used for udpipe
    :param disable_sbd: disables spaCy's automatic sentence boundary
           detection (only has an effect for 'spacy')
    :param parser_opts: extra keyword options for the core pipeline. For
           spacy and udpipe they go to the respective ``.load()`` call, for
           stanfordnlp and stanza to the ``.Pipeline()`` constructor
    :param kwargs: options passed on to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    :raises ValueError: if ``parser`` is not one of the supported names
    """
    # Reject unknown backends up front so the branches below only deal with
    # supported values.
    if parser not in ("spacy", "stanfordnlp", "stanza", "udpipe"):
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    backend_opts = parser_opts if parser_opts is not None else {}

    if parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **backend_opts)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        nlp = StanzaLanguage(
            stanza.Pipeline(lang=model_or_lang,
                            tokenize_pretokenized=is_tokenized,
                            **backend_opts))
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        nlp = StanfordNLPLanguage(
            stanfordnlp.Pipeline(lang=model_or_lang,
                                 tokenize_pretokenized=is_tokenized,
                                 **backend_opts))
    else:
        # Only "spacy" is left after the validation above.
        nlp = spacy.load(model_or_lang, **backend_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")

    # The formatter is always the final component of the pipeline.
    nlp.add_pipe(ConllFormatter(nlp, **kwargs), last=True)

    return nlp
示例#5
0
from typing import *
import json

# In[2]:

# One-off model download, kept commented out:
# stanfordnlp.download('en')
# Config: JSON file read below, and the path intended for the reconverted
# output (not written anywhere in this part of the script).
input_path = Path('train_has_following.json')
output_path = Path('.data/RACE/train_has_following_reconverted.json')

# In[3]:

# English stanfordnlp pipeline wrapped as a spaCy-style pipeline, with a
# ConllFormatter appended as the last component, plus a Moses detokenizer.
snlp = stanfordnlp.Pipeline(lang='en')
nlp = StanfordNLPLanguage(snlp)
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)
detokenizer = MosesDetokenizer()

# In[4]:

# Load the input samples from the JSON file configured above.
with open(input_path) as f:
    samples = json.load(f)

# In[5]:

# Suppress every warning from here on (a global filter, not scoped to a
# block).
warnings.filterwarnings("ignore")


def convert2(qa_sample: Dict):
    q_doc = nlp(qa_sample['question'])