Example #1
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string. 
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = _tokenize(s)
    if isinstance(s, (list, tuple)):
        s = [isinstance(s, basestring) and s.split(" ") or s for s in s]
    if isinstance(s, basestring):
        s = [s.split(" ") for s in s.split("\n")]
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
          "light": False,
        "lexicon": LEXICON,
       "language": "de",
        "default": "NN",
            "map": kwargs.get("tagset", "") != STTS and stts2penntreebank or None,
    })
    # The German lexicon uses "ss" instead of "ß".
    # Instead of simply replacing it, we keep a hash map of the normalized words.
    # After parsing we restore the "ß" so the output stays identical to the input.
    m = dict((token.replace(u"ß", "ss"), token) for sentence in s for token in sentence)
    s = [[token.replace(u"ß", "ss") for token in sentence] for sentence in s]
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    p = [[[m[token[0]]] + token[1:] for token in sentence] for sentence in s.split()]
    p = "\n".join([" ".join(["/".join(token) for token in sentence]) for sentence in p])
    s = TaggedString(p, tags=s.tags, language="de")
    # Use pattern.de.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        s = TaggedString(p, tags=s.tags+["lemma"], language="de")
    return s
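
A minimal usage sketch for the example above, assuming it is the German parser exposed as pattern.de.parse (LEXICON, _tokenize, _en_parse, STTS, stts2penntreebank, TaggedString and find_lemmata are module-level names in that module); the commented output layout is illustrative, not guaranteed:

# Usage sketch, not part of the function above:
from pattern.de import parse

s = parse(u"Die Straße ist naß.", lemmata=True)
print(s)       # word/tag/chunk tokens, one sentence per line; "ß" is preserved
print(s.tags)  # token layout, ending with 'lemma' when lemmata=True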
Example #2
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string. 
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = [s.split(" ") for s in _tokenize(s)]
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
          "light": False,
        "lexicon": LEXICON,
            "map": kwargs.get("tagset", "").lower() != WOTAN and wotan2penntreebank or None,
       "language": "nl",
    })
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
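    # Use pattern.nl.inflect for lemmatization: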
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        p = "\n".join([
            " ".join(["/".join(token) for token in sentence]) for sentence in p
        ])
        s = TaggedString(p, tags=s.tags + ["lemma"], language="nl")
    return s
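
A usage sketch for this variant, assuming it is pattern.nl.parse and that the WOTAN constant holds the lowercase string "wotan" (both assumptions, inferred from the .lower() comparison above):

# Usage sketch, not part of the function above:
from pattern.nl import parse

# By default the Wotan tags are mapped to Penn Treebank tags.
print(parse(u"De kat zit op de mat."))
# With tagset="wotan" the "map" entry resolves to None, so the
# original Wotan tags are kept.
print(parse(u"De kat zit op de mat.", tagset="wotan"))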
Example #3
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string. 
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = _tokenize(s)
    if isinstance(s, (list, tuple)):
        s = [isinstance(s, basestring) and s.split(" ") or s for s in s]
    if isinstance(s, basestring):
        s = [s.split(" ") for s in s.split("\n")]
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
          "light": False,
        "lexicon": LEXICON,
       "language": "fr",
        "default": "NN",
            "map": None,
    })
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    # Use pattern.fr.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        s = TaggedString(p, tags=s.tags+["lemma"], language="fr")
    return s
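
A usage sketch, assuming the function above ships as pattern.fr.parse; s.split() yielding nested [sentence][token] lists is how the lemmata branch above consumes the tagged string:

# Usage sketch, not part of the function above:
from pattern.fr import parse

s = parse(u"Le chat noir dort.", lemmata=True)
for sentence in s.split():   # nested lists, as passed to find_lemmata above
    for token in sentence:
        print(token)         # [word, tag, chunk, ...] plus the lemma field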
Example #4
def parse(s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
    """ Takes a string (sentences) and returns a tagged Unicode string. 
        Sentences in the output are separated by newlines.
    """
    if tokenize:
        s = _tokenize(s)
    # Reuse the English parser:
    kwargs.update({
        "lemmata": False,
          "light": False,
        "lexicon": LEXICON,
       "language": "nl",
        "default": "N(soort,ev,neut)",
            "map": kwargs.get("tagset", "") != WOTAN and wotan2penntreebank or None,
    })
    s = _en_parse(s, False, tags, chunks, relations, **kwargs)
    # Use pattern.nl.inflect for lemmatization:
    if lemmata:
        p = [find_lemmata(sentence) for sentence in s.split()]
        s = TaggedString(p, tags=s.tags + ["lemma"], language="nl")
    return s
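
A last usage sketch for this pattern.nl variant (assumed module name), showing the lemmata flag; the "default" entry above ("N(soort,ev,neut)") is presumably the tag assigned to unknown words, mirroring "NN" in the other examples:

# Usage sketch, not part of the function above:
from pattern.nl import parse

s = parse(u"De katten liepen weg.", lemmata=True)
print(s)       # each token carries its lemma as the last field
print(s.tags)  # token layout; ends with 'lemma'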