def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string.
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = _tokenize(s)
    if isinstance(s, (list, tuple)):
        s = [isinstance(s, basestring) and s.split(" ") or s for s in s]
    if isinstance(s, basestring):
        s = [s.split(" ") for s in s.split("\n")]
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
        "light": False,
        "lexicon": LEXICON,
        "language": "de",
        "default": "NN",
        "map": kwargs.get("tagset", "") != STTS and stts2penntreebank or None,
    })
    # The German lexicon uses "ss" instead of "ß".
    # Instead of simply replacing it, we keep a map from each "ss"-normalized
    # word back to the original token, so that "ß" can be restored after
    # parsing and the output stays identical to the input.
    m = dict((token.replace(u"ß", "ss"), token) for sentence in s for token in sentence)
    s = [[token.replace(u"ß", "ss") for token in sentence] for sentence in s]
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    # Restore the original "ß"-spellings in the tagged output:
    p = [[[m[token[0]]] + token[1:] for token in sentence] for sentence in s.split()]
    p = "\n".join([" ".join(["/".join(token) for token in sentence]) for sentence in p])
    s = TaggedString(p, tags=s.tags, language="de")
    # Use pattern.de.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        s = TaggedString(p, tags=s.tags + ["lemma"], language="de")
    return s
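# Usage sketch for the German parser above (a hedged example, not verified
# library output). It assumes this function lives in a module importable as
# pattern.de, as in the Pattern library, with its lexicon files installed;
# the exact tags depend on the bundled model. Note the code is Python 2
# style (basestring, unicode strings).
#
#     >>> print(parse(u"Die Straße ist naß."))
#     Die/DT/B-NP/O Straße/NN/I-NP/O ...      (illustrative)
#
# The "ß" survives the round-trip: the lexicon lookup happens on the
# "ss"-normalized form, but the output tokens are restored from the map m.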
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string.
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = [s.split(" ") for s in _tokenize(s)]
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
        "light": False,
        "lexicon": LEXICON,
        "map": kwargs.get("tagset", "").lower() != WOTAN and wotan2penntreebank or None,
        "language": "nl",
    })
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    # Use pattern.nl.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        p = "\n".join([" ".join(["/".join(token) for token in sentence]) for sentence in p])
        s = TaggedString(p, tags=s.tags + ["lemma"], language="nl")
    return s
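# Usage sketch for the Dutch parser above (hedged, assuming a module such as
# pattern.nl). Passing tagset=WOTAN makes the tagset check evaluate to False,
# so "map" becomes None and the lexicon's native Wotan tags are kept instead
# of being mapped to Penn Treebank tags:
#
#     >>> parse(u"De kat ligt op de mat.", tagset=WOTAN)  # native Wotan tags
#     >>> parse(u"De kat ligt op de mat.", lemmata=True)  # + trailing /lemma field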
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string.
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = _tokenize(s)
    if isinstance(s, (list, tuple)):
        s = [isinstance(s, basestring) and s.split(" ") or s for s in s]
    if isinstance(s, basestring):
        s = [s.split(" ") for s in s.split("\n")]
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
        "light": False,
        "lexicon": LEXICON,
        "language": "fr",
        "default": "NN",
        "map": None,
    })
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    # Use pattern.fr.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        s = TaggedString(p, tags=s.tags + ["lemma"], language="fr")
    return s
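# Usage sketch for the French parser above (hedged, assuming a module such as
# pattern.fr). No tag mapping is applied ("map": None), so the lexicon's tags
# are emitted as-is. With lemmata=True each token gains a trailing lemma
# field, e.g. a token like chats/NNS/B-NP/O would become chats/NNS/B-NP/O/chat
# (illustrative, not verified tagger output):
#
#     >>> print(parse(u"Les chats noirs.", lemmata=True))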
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string.
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = _tokenize(s)
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
        "light": False,
        "lexicon": LEXICON,
        "language": "nl",
        "default": "N(soort,ev,neut)",
        "map": kwargs.get("tagset", "") != WOTAN and wotan2penntreebank or None,
    })
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    # Use pattern.nl.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        s = TaggedString(p, tags=s.tags + ["lemma"], language="nl")
    return s
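# Usage sketch for the second Dutch parser variant above (hedged). The
# default tag for unknown words is the Wotan tag "N(soort,ev,neut)" (a
# singular common noun), which the wotan2penntreebank map should turn into
# "NN" unless tagset=WOTAN is passed. The returned TaggedString records its
# token format in .tags:
#
#     >>> s = parse(u"De kat ligt op de mat.")
#     >>> s.tags   # e.g. ['word', 'part-of-speech', 'chunk', 'preposition']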