Example #1
def process_tokenlist(nlp, tokenlist, enriched=False):
    """process_tokenlist: creates a spacy doc element of a token list

    :param nlp: spacy NLP element
    :param tokenlist: list of dicts containing tokens and parameters
    :param enriched: if set to True spacy pipeline is run
    """
    json = {}
    json['tokenArray'] = tokenlist
    ar_tok = [x['value'] for x in json['tokenArray']]
    ar_wsp = [x.get('whitespace', True) for x in json['tokenArray']]
    if Token.get_extension('tokenId') is None:
        Token.set_extension('tokenId', default=False)
    doc = Doc(nlp.vocab, words=ar_tok, spaces=ar_wsp)
    for id, t in enumerate(doc):
        t._.set('tokenId', json['tokenArray'][id].get('tokenId', False))
        t_type = json['tokenArray'][id].get('type', False)
        if not t.tag_ and t_type:
            t.tag_ = t_type
        for k in json['tokenArray'][id].keys():
            if k.upper() in SPACY_ACCEPTED_DATA:
                setattr(
                    t,
                    k.lower(),
                    json['tokenArray'][id][k],
                )  # TODO: need to set ent_iob
    if enriched:
        for name, proc in nlp.pipeline:
            doc = proc(doc)
    return doc
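A minimal usage sketch for process_tokenlist, assuming it is imported from its module together with the objects it relies on (Token, Doc, SPACY_ACCEPTED_DATA); the token dicts and their tokenId values below are made up for illustration:

import spacy

nlp = spacy.blank('en')
tokens = [
    {'value': 'Hello', 'tokenId': 't1', 'whitespace': True},
    {'value': 'world', 'tokenId': 't2', 'whitespace': False},
]
doc = process_tokenlist(nlp, tokens)
# Each token carries the custom attribute registered inside the function.
print([(t.text, t._.tokenId) for t in doc])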
Example #2
    def __init__(self,
                 data_dir=DATA_DIR,
                 model_dir_path=None,
                 lexicon_file_path=None,
                 tag_file_path=None,
                 package=PACKAGE,
                 url=URL_MODEL,
                 print_probas=False):
        super(POSTagger, self).__init__(package,
                                        url=url,
                                        download_dir=data_dir)
        if not tk.get_extension(self.name):
            tk.set_extension(self.name, default=None)
        else:
            LOGGER.info('Token {} already registered'.format(self.name))

        model_dir_path = model_dir_path if model_dir_path else os.path.join(
            data_dir, package, 'models/fr')
        lexicon_file_path = lexicon_file_path if lexicon_file_path else os.path.join(
            model_dir_path, 'lexicon.json')
        tag_file_path = tag_file_path if tag_file_path else os.path.join(
            model_dir_path, 'tag_dict.json')

        LOGGER.info("  TAGGER: Loading lexicon...")
        self.lex_dict = unserialize(lexicon_file_path)
        LOGGER.info("  TAGGER: Loading tags...")
        self.tag_dict = unserialize(tag_file_path)
        self.classifier = MaxEntClassifier()
        self.cache = {}
        self._load_model(model_dir_path)
        # print the probability of the tag along with the tag itself
        self.print_probas = print_probas
        return
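The guard around tk.get_extension / tk.set_extension (tk being spaCy's Token class here) is the idiom all of these examples share: register the per-token attribute exactly once, because registering an existing name again without force=True raises an error. A standalone sketch of the pattern, with a hypothetical attribute name in place of self.name:

from spacy.tokens import Token

ATTR = 'melt_tag'  # hypothetical name; the tagger itself uses self.name

if not Token.get_extension(ATTR):
    Token.set_extension(ATTR, default=None)

# A component would later write its prediction to token._.melt_tag,
# and callers would read it back the same way:
#     for token in doc:
#         print(token.text, token._.melt_tag)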
Example #3
File: __init__.py  Project: sorami/ginza
class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    if not Token.get_extension('inf'):
        Token.set_extension('inf', default='')
    if not Token.get_extension('reading'):
        Token.set_extension('reading', default='')
    if not Token.get_extension('sudachi'):
        Token.set_extension('sudachi', default='')
    if not Token.get_extension('bunsetu_index'):
        Token.set_extension('bunsetu_index', default='')
    if not Token.get_extension('bunsetu_bi_label'):
        Token.set_extension('bunsetu_bi_label', default='')
    if not Token.get_extension('bunsetu_position_type'):
        Token.set_extension('bunsetu_position_type', default='')
    if not Token.get_extension('ne'):
        Token.set_extension('ne', default='')

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return SudachipyTokenizer(nlp)

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        return None
Example #4
    def _add_custom_spacy_extensions(self):
        for n, f in self.is_attrs_name2func:
            ext = Token.get_extension(n)
            if ext is None:
                Token.set_extension(n, getter=f, force=True)
        for item in [Span, Doc]:
            for n, f in self.has_attrs_name2func:
                ext = item.get_extension(n)
                if ext is None:
                    #print(f"Setting: {item}.set_extension({n}, getter= {f})")
                    item.set_extension(n, getter=f, force=True)

        # Add Attr Getters for Span (i.e. Doc.ents)
        for n, f in self.get_attrs_name2func:
            ext = Span.get_extension(n)
            if ext is None:
                Span.set_extension(n, getter=f, force=True)
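Unlike the default-valued attributes above, _add_custom_spacy_extensions registers getter-backed extensions, which are computed on access rather than stored. A minimal sketch of that variant with a made-up attribute name and getter (the real name-to-function pairs come from the surrounding class):

from spacy.tokens import Doc, Span, Token

def get_has_digit(obj):
    # Getter extensions are evaluated lazily on every attribute access.
    return any(ch.isdigit() for ch in obj.text)

for cls in (Token, Span, Doc):
    if cls.get_extension('has_digit') is None:
        cls.set_extension('has_digit', getter=get_has_digit, force=True)

# doc._.has_digit, span._.has_digit and token._.has_digit are now available.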
Example #5
    def __init__(self,
                 data_dir=DATA_DIR,
                 lefff_file_name=LEFFF_FILE_NAME,
                 after_melt=False):
        LOGGER.info('New LefffLemmatizer instantiated.')
        # register your new attribute token._.lefff_lemma
        if not Token.get_extension(self.name):
            Token.set_extension(self.name, default=None)
        else:
            LOGGER.info('Token {} already registered'.format(self.name))
        # In memory lemma mapping
        self.lemma_dict = {}
        self.after_melt = after_melt
        with io.open(os.path.join(data_dir, lefff_file_name),
                     encoding='utf-8') as lefff_file:
            LOGGER.info('Reading lefff data...')
            for line in lefff_file:
                els = line.split('\t')
                self.lemma_dict[(els[0], els[1])] = els[2]
        LOGGER.info('Successfully loaded lefff lemmatizer')
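The loading loop assumes every lefff line holds at least three tab-separated fields, so lemma_dict ends up mapping a (surface form, POS tag) pair to a lemma. A self-contained sketch of the kind of lookup the lemmatizer performs later; the entries, the 'v'/'nc' tags and the fall-back-to-surface-form behaviour are illustrative assumptions, not taken from the real lexicon:

lemma_dict = {('chantons', 'v'): 'chanter'}

def lookup_lemma(form, tag, lemma_dict):
    # Fall back to the surface form when the (form, tag) pair is unknown.
    return lemma_dict.get((form, tag), form)

print(lookup_lemma('chantons', 'v', lemma_dict))   # -> 'chanter'
print(lookup_lemma('bonjour', 'nc', lemma_dict))   # -> 'bonjour'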
Example #6
    def __init__(self,
                 data_dir=DATA_DIR,
                 lexicon_file_name=LEXICON_FILE,
                 tag_file_name=TAG_DICT,
                 print_probas=False):
        super(POSTagger, self).__init__(PACKAGE,
                                        url=URL_MODEL,
                                        download_dir=DATA_DIR)
        if not tk.get_extension(self.name):
            tk.set_extension(self.name, default=None)
        else:
            LOGGER.info('Token {} already registered'.format(self.name))
        LOGGER.info("  TAGGER: Loading lexicon...")
        self.lex_dict = unserialize(lexicon_file_name)
        LOGGER.info("  TAGGER: Loading tags...")
        self.tag_dict = unserialize(tag_file_name)
        self.classifier = MaxEntClassifier()
        self.cache = {}
        self.load_model()
        # print the probability of the tag along with the tag itself
        self.print_probas = print_probas
        return
Example #7
File: __init__.py  Project: yut148/ginza
from .sudachi_tokenizer import SudachiTokenizer, LANG_NAME, TAG_MAP, SUDACHI_DEFAULT_MODE
from .parse_tree import correct_dep
from .syntax_iterators import SYNTAX_ITERATORS, noun_chunks

__all__ = [
    'Japanese',
    'JapaneseCorrector',
    'load_model',
    'save_model',
    'create_model_path',
]

Language.factories['JapaneseCorrector'] = lambda nlp, **cfg: JapaneseCorrector(
    nlp)

if not Token.get_extension('pos_detail'):
    Token.set_extension('pos_detail', default='')
if not Token.get_extension('inf'):
    Token.set_extension('inf', default='')


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: LANG_NAME

    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS  # TODO: does not work for spaCy 2.0.12, see workaround in JapaneseCorrector

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return SudachiTokenizer(nlp)
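Registering the component under Language.factories (the spaCy v2 mechanism this snippet targets) makes it resolvable by name, so a pipeline built on this module's Japanese class can attach it roughly like this; nlp and the sample sentence are assumptions for illustration:

# spaCy v2-style usage; nlp is assumed to be an instance of the Japanese
# Language subclass defined in this module.
corrector = nlp.create_pipe('JapaneseCorrector')  # resolved via Language.factories
nlp.add_pipe(corrector, last=True)

doc = nlp('銀座でランチをご一緒しましょう。')
# The module-level extensions registered above are readable on every token.
print([(t.text, t._.inf, t._.pos_detail) for t in doc])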
Example #8
File: __init__.py  Project: yyht/ginza
from spacy.vocab import Vocab
from spacy.compat import copy_reg

from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP

from .japanese_corrector import JapaneseCorrector
from .sudachi_tokenizer import SudachiTokenizer

ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

Language.factories['JapaneseCorrector'] = lambda nlp, **cfg: JapaneseCorrector(
    nlp)

if not Token.get_extension('inf'):
    Token.set_extension('inf', default='')
if not Token.get_extension('bunsetu_bi_label'):
    Token.set_extension('bunsetu_bi_label', default='')
if not Token.get_extension('bunsetu_position_type'):
    Token.set_extension('bunsetu_position_type', default='')


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS  # TODO: does not work for spaCy 2.0.12, see workaround in JapaneseCorrector
    writing_system = {
        "direction": "ltr",