def __init__(self, snlp, meta=None, **kwargs):
    """Initialize the Language class.

    Instead of "en" etc. we call the language "stanza_en" so as not to
    conflict with spaCy's built-in languages. Using entry points, this
    also allows serializing and deserializing the language class, and
    "lang": "stanza_en" in the meta.json will automatically instantiate
    this class if this package is available.

    snlp (stanza.Pipeline): The loaded Stanza pipeline.
    meta: spaCy model metadata.
    kwargs: Optional config parameters.
    RETURNS (spacy.language.Language): The nlp object.
    """
    if hasattr(snlp, "lang"):
        lang = snlp.lang
    else:
        # backward compatible with stanza v1.0.0
        lang = snlp.processors["tokenize"].config["lang"]
    self.snlp = snlp
    self.svecs = StanzaLanguage._find_embeddings(snlp)
    self.lang = "stanza_" + lang
    self.Defaults = get_defaults(lang)
    self.vocab = create_vocab(lang, self.Defaults)
    self.tokenizer = Tokenizer(snlp, self.vocab)
    self._components = []
    self._disabled = set()
    self.max_length = kwargs.get("max_length", 10**6)
    self.batch_size = kwargs.get("batch_size", 256)
    self._meta = (
        {"lang": self.lang, "stanza": snlp.config}
        if meta is None
        else dict(meta)
    )
    self._path = None
    self._optimizer = None
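# Usage sketch (illustrative, not from the original source): assuming the
# constructor above belongs to a StanzaLanguage class (the name is inferred
# from the _find_embeddings call), wiring it to a loaded Stanza pipeline
# would look roughly like this. stanza.download and stanza.Pipeline are
# standard Stanza APIs.
import stanza

stanza.download("en")              # one-time model download
snlp = stanza.Pipeline(lang="en")  # load the Stanza pipeline
nlp = StanzaLanguage(snlp)         # wrap it as a spaCy-compatible Language
doc = nlp("Barack Obama was born in Hawaii.")
print([(token.text, token.pos_) for token in doc])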
def __init__(self, convUD):
    self.Defaults.lex_attr_getters[LANG] = lambda _text: "eu"
    try:
        # spaCy v2: the Defaults class builds its own Vocab
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    except AttributeError:
        # spaCy v3: use the module-level vocab factory instead
        from spacy.vocab import create_vocab
        self.vocab = create_vocab("eu", self.Defaults)
        self._components = []
        self._disabled = set()
    self.tokenizer = ixaKatTokenizer(self.vocab, convUD)
    self._meta = {
        "author": "Koichi Yasuoka",
        "description": "derived from ixaKat",
        "lang": "eu_ixaKat",
        "license": "MIT",
        "name": "eu_ixaKat",
        "pipeline": "Tokenizer, POS-Tagger, Parser",
        "spacy_version": ">=2.2.2",
    }
    self._path = None
def __init__(self, UniDic):
    self.Defaults.lex_attr_getters[LANG] = lambda _text: "ja"
    try:
        # spaCy v2
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    except AttributeError:
        # spaCy v3
        from spacy.vocab import create_vocab
        self.vocab = create_vocab("ja", self.Defaults)
        self._components = []
        self._disabled = set()
    self.tokenizer = SynChaTokenizer(self.vocab, UniDic)
    self._meta = {
        "author": "Koichi Yasuoka",
        "description": "derived from SynCha-CaboCha-MeCab",
        "lang": "ja_SynCha_CaboCha_MeCab",
        "license": "MIT",
        "name": "SynCha_CaboCha_MeCab",
        "pipeline": "Tokenizer, POS-Tagger, Parser",
        "spacy_version": ">=2.2.2",
    }
    self._path = None
def __init__(self, BERT, Danku):
    self.Defaults.lex_attr_getters[LANG] = lambda _text: "lzh"
    try:
        # spaCy v2
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    except AttributeError:
        # spaCy v3
        from spacy.vocab import create_vocab
        self.vocab = create_vocab("lzh", self.Defaults)
        self._components = []
        self._disabled = set()
    self.tokenizer = SuParKanbunTokenizer(BERT, Danku, self.vocab)
    self._meta = {
        "author": "Koichi Yasuoka",
        "description": "derived from SuParKanbun",
        "lang": "SuParKanbun_lzh",
        "license": "MIT",
        "name": "SuParKanbun_lzh",
        "parent_package": "suparkanbun",
        "pipeline": "Tokenizer, POS-Tagger, Parser",
        "spacy_version": ">=2.1.0",
    }
    self._path = None
def __init__(self, UniDic, UDPipe):
    self.Defaults.lex_attr_getters[LANG] = lambda _text: "ja"
    try:
        # spaCy v2
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    except AttributeError:
        # spaCy v3
        from spacy.vocab import create_vocab
        self.vocab = create_vocab("ja", self.Defaults)
        self._components = []
        self._disabled = set()
    self.tokenizer = UniDicTokenizer(UniDic, UDPipe, self.vocab)
    self._meta = {
        "author": "Koichi Yasuoka",
        "description": "derived from UniDic2UD",
        "lang": "UniDic_" + UniDic if UniDic is not None else "udpipe_ja-modern",
        "license": "MIT",
        "name": UniDic if UniDic is not None else "ja-modern",
        "parent_package": "spacy_unidic",
        "pipeline": "Tokenizer, POS-Tagger, Parser",
        "spacy_version": ">=2.1.0",
    }
    self._path = None
def __init__(self, api):
    self.Defaults.lex_attr_getters[LANG] = lambda _text: "cop"
    try:
        # spaCy v2
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    except AttributeError:
        # spaCy v3
        from spacy.vocab import create_vocab
        self.vocab = create_vocab("cop", self.Defaults)
        self._components = []
        self._disabled = set()
    self.tokenizer = CopticTokenizer(api, self.vocab)
    self._meta = {
        "author": "Koichi Yasuoka",
        "description": "derived from Coptic-NLP",
        "lang": "Coptic_NLP_cop",
        "license": "MIT",
        "name": "Coptic_NLP_cop",
        "parent_package": "Coptic-NLP",
        "pipeline": "Tokenizer, POS-Tagger, Parser",
        "spacy_version": ">=2.1.0",
    }
    self._path = None
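# The five constructors above share one version-compatibility idiom: spaCy v2
# exposes Defaults.create_vocab(), while spaCy v3 replaces it with the
# module-level spacy.vocab.create_vocab(lang, defaults). A minimal sketch of
# that shared pattern as a standalone helper (the helper name is an
# assumption for illustration; it does not appear in these packages):
def make_vocab(lang, defaults):
    try:
        # spaCy v2: the Defaults class builds its own Vocab
        return defaults.create_vocab()
    except AttributeError:
        # spaCy v3: create_vocab is a module-level factory
        from spacy.vocab import create_vocab
        return create_vocab(lang, defaults)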
def __init__(
    self,
    udpipe_model: UDPipeModel,
    meta: Optional[Dict] = None,
    **kwargs,
):
    """Initialize the Language class.

    The language is called "udpipe_en" instead of "en" in order to avoid
    any potential conflicts with spaCy's built-in languages. Using entry
    points, this enables serializing and deserializing the language class,
    and "lang": "udpipe_en" in the meta.json will automatically
    instantiate this class if this package is available.

    udpipe_model: The loaded UDPipe model.
    meta: spaCy model metadata.
    kwargs: Optional config parameters.
    """
    self.udpipe = udpipe_model
    self.Defaults = get_defaults(lang=udpipe_model._lang)
    self.lang = f"udpipe_{udpipe_model._lang}"
    ignore_tag_map = kwargs.get("ignore_tag_map", False)
    if ignore_tag_map:
        self.Defaults.tag_map = {}  # workaround for ValueError: [E167]
    if SPACY_V3:
        from spacy.vocab import create_vocab
        from spacy.language import DEFAULT_CONFIG

        self.vocab = create_vocab(udpipe_model._lang, self.Defaults)
        self.batch_size = 1000
        self._components = []
        self._disabled = set()
        self._config = DEFAULT_CONFIG.merge(self.default_config)
    else:
        self.vocab = self.Defaults.create_vocab()
        self.pipeline = []
    self.tokenizer = UDPipeTokenizer(model=self.udpipe, vocab=self.vocab)
    self.max_length = kwargs.get("max_length", 10**6)
    self._meta = self.udpipe._meta if meta is None else dict(meta)
    self._path = None
    self._optimizer = None
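# Usage sketch (assumed, mirroring spacy-udpipe's documented entry points):
# the constructor above is rarely called directly; the package-level helpers
# download a UDPipe model and return the wrapped Language object.
import spacy_udpipe

spacy_udpipe.download("en")    # download the English UDPipe model
nlp = spacy_udpipe.load("en")  # returns the Language built by __init__ above
doc = nlp("Barack Obama was born in Hawaii.")
print([(token.text, token.lemma_, token.dep_) for token in doc])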