# We restore the "ß" after parsing. tokens_ss = [t.replace(u"ß", "ss") for t in tokens] tokens_ss = _Parser.find_tags(self, tokens_ss, **kwargs) return [[w] + tokens_ss[i][1:] for i, w in enumerate(tokens)] parser = Parser(lexicon=os.path.join(MODULE, "de-lexicon.txt"), frequency=os.path.join(MODULE, "de-frequency.txt"), morphology=os.path.join(MODULE, "de-morphology.txt"), context=os.path.join(MODULE, "de-context.txt"), default=("NN", "NE", "CARDNUM"), language="de") lexicon = parser.lexicon # Expose lexicon. spelling = Spelling(path=os.path.join(MODULE, "de-spelling.txt")) def tokenize(s, *args, **kwargs): """ Returns a list of sentences, where punctuation marks have been split from words. """ return parser.find_tokens(s, *args, **kwargs) def parse(s, *args, **kwargs): """ Returns a tagged Unicode string. """ return parser.parse(s, *args, **kwargs) def parsetree(s, *args, **kwargs):
# NOTE(review): this physical line is likewise a whitespace-mangled paste of
# several definitions from what looks like a Russian parser module
# (presumably pattern.text.ru — confirm against upstream):
#   * the final `return _Parser.find_tags(self, tokens, **kwargs)` of a
#     find_tags override whose `def` header is outside this view;
#   * module-level construction of Parser with ru-lexicon.txt, ru-frequency.txt
#     and an SLP model file (ru-model.slp); the morphology/context/entities/
#     default arguments are commented out in the pasted text;
#   * a Spelling object over ru-spelling.txt with alphabet='CYRILLIC';
#   * tokenize(s) and parse(s) convenience wrappers around the parser.
# As pasted, this line is a SyntaxError: it begins with a bare `return` at
# column 0, and the first inline "#" (after "Parser(") comments out everything
# that follows on the line. The original newlines must be restored before any
# of this runs. Kept byte-identical below; TODO: restore formatting from the
# canonical file.
return _Parser.find_tags(self, tokens, **kwargs) parser = Parser( lexicon=os.path.join(MODULE, "ru-lexicon.txt"), # A dict of known words => most frequent tag. frequency=os.path.join(MODULE, "ru-frequency.txt"), # A dict of word frequency. model=os.path.join(MODULE, "ru-model.slp"), # A SLP classifier trained on WSJ (01-07). #morphology=os.path.join(MODULE, "en-morphology.txt"), # A set of suffix rules #context=os.path.join(MODULE, "en-context.txt"), # A set of contextual rules. #entities=os.path.join(MODULE, "en-entities.txt"), # A dict of named entities: John = NNP-PERS. #default=("NN", "NNP", "CD"), language="ru" ) spelling = Spelling( path=os.path.join(MODULE, "ru-spelling.txt"), alphabet='CYRILLIC' ) def tokenize(s, *args, **kwargs): """ Returns a list of sentences, where punctuation marks have been split from words. """ return parser.find_tokens(s, *args, **kwargs) def parse(s, *args, **kwargs): """ Returns a tagged Unicode string. """ return parser.parse(s, *args, **kwargs)