예제 #1
0
def tokenize(s, punctuation=PUNCTUATION, abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."], replace={}):
    """Tokenize Dutch text and re-attach the clitic in "'s morgens/middags/avonds".

    The text is first handed to ``_en_tokenize`` with the given punctuation,
    abbreviations and replacement table. Each item of the result then has a
    detached "' s" before "morgens", "middags" or "avonds" joined back
    into "'s".

    Returns a new list with the corrected items.
    """
    # NOTE: 's after a vowel marks a Dutch plural ("auto's"), which is why the
    # replacement table is empty by default and 's is not replaced.
    # The default list/dict objects are only passed through here, never mutated.
    sentences = _en_tokenize(s, punctuation, abbreviations, replace)
    # Presumably the upstream tokenizer splits the apostrophe from the "s";
    # these pairs undo that for the fixed time-of-day expressions.
    fixes = (
        ("' s morgens", "'s morgens"),
        ("' s middags", "'s middags"),
        ("' s avonds", "'s avonds"),
    )
    repaired = []
    for sentence in sentences:
        for broken, joined in fixes:
            sentence = sentence.replace(broken, joined)
        repaired.append(sentence)
    return repaired
예제 #2
0
def tokenize(s,
             punctuation=PUNCTUATION,
             abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."],
             replace={}):
    """Tokenize Dutch text via ``_en_tokenize`` and repair "'s" time expressions.

    After delegating to ``_en_tokenize``, every item in the result has
    "' s morgens", "' s middags" and "' s avonds" rewritten to
    "'s morgens", "'s middags" and "'s avonds" respectively.

    Returns a new list of the repaired items.
    """
    # In Dutch, 's preceded by a vowel indicates a plural ("auto's"),
    # so 's is deliberately not part of the default replacements.
    tokenized = _en_tokenize(s, punctuation, abbreviations, replace)
    # One pass per item; the replacements run in the same order as before
    # (morgens, middags, avonds).
    return [
        item.replace("' s morgens", "'s morgens")
            .replace("' s middags", "'s middags")
            .replace("' s avonds", "'s avonds")
        for item in tokenized
    ]
예제 #3
0
def tokenize(s,
             punctuation=PUNCTUATION,
             abbreviations=abbreviations,
             replace={"'n": " 'n"}):
    """Tokenize Dutch text and rejoin "'s" in time-of-day expressions.

    Delegates to ``_en_tokenize`` (by default replacing "'n" with " 'n"),
    then rewrites "' s ochtends|morgens|middags|avonds" to
    "'s ochtends|morgens|middags|avonds" in every resulting item.

    Returns a new list of the corrected items.
    """
    # 's after a vowel marks a Dutch plural ("auto's"): it is not replaced.
    sentences = _en_tokenize(s, punctuation, abbreviations, replace)
    # Compile once and reuse for every sentence.
    detached = re.compile(r"' s (ochtends|morgens|middags|avonds)")
    rejoined = []
    for sentence in sentences:
        rejoined.append(detached.sub("'s \\1", sentence))
    return rejoined
예제 #4
0
def tokenize(s, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace={}):
    """Tokenize ``s`` by delegating to ``_en_tokenize``.

    The arguments are forwarded positionally and unchanged; this wrapper only
    supplies the module's own default punctuation and abbreviations, and an
    empty replacement table.

    Returns whatever ``_en_tokenize`` returns.
    """
    tokenized = _en_tokenize(s, punctuation, abbreviations, replace)
    return tokenized
예제 #5
0
파일: __init__.py 프로젝트: navtej/pattern
def tokenize(s, punctuation=PUNCTUATION, abbreviations=abbreviations, replace={"'n": " 'n"}):
    """Tokenize Dutch text, then re-attach "'s" before a time-of-day word.

    ``_en_tokenize`` does the actual tokenization (by default replacing "'n"
    with " 'n"). Each item of its result is then post-processed so that
    "' s" followed by "ochtends", "morgens", "middags" or "avonds" becomes
    "'s" followed by that word.

    Returns a new list of the post-processed items.
    """
    # Dutch 's preceded by a vowel is a plural marker ("auto's"): don't replace.
    pattern = r"' s (ochtends|morgens|middags|avonds)"
    replacement = "'s \\1"
    tokenized = _en_tokenize(s, punctuation, abbreviations, replace)
    return [re.sub(pattern, replacement, sentence) for sentence in tokenized]
예제 #6
0
def tokenize(s,
             punctuation=PUNCTUATION,
             abbreviations=ABBREVIATIONS,
             replace={}):
    """Forward ``s`` to ``_en_tokenize`` using this module's defaults.

    No post-processing is applied: the punctuation, abbreviations and
    replacement table are passed through positionally and the result of
    ``_en_tokenize`` is returned as-is.
    """
    result = _en_tokenize(s, punctuation, abbreviations, replace)
    return result
예제 #7
0
def tokenize(s,
             punctuation=PUNCTUATION,
             abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."],
             replace={}):
    """Tokenize Dutch text by delegating to ``_en_tokenize``.

    Supplies Dutch abbreviations ("bv.", "blz.", "e.d.", "m.a.w.", "nl.") and
    an empty replacement table by default; the result of ``_en_tokenize`` is
    returned unchanged.
    """
    # The replacement table is empty on purpose: in Dutch, 's preceded by a
    # vowel indicates a plural ("auto's") and must not be replaced.
    tokens = _en_tokenize(s, punctuation, abbreviations, replace)
    return tokens
예제 #8
0
def tokenize(s, punctuation=PUNCTUATION, abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."], replace={}):
    """Hand ``s`` to ``_en_tokenize`` with Dutch default abbreviations.

    All four arguments are forwarded positionally and the return value of
    ``_en_tokenize`` is passed back without modification.
    """
    # Dutch plural 's after a vowel ("auto's") is left alone, hence the empty
    # default for ``replace``.
    output = _en_tokenize(s, punctuation, abbreviations, replace)
    return output
예제 #9
0
def tokenize(s, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements):
    """Tokenize ``s`` via ``_en_tokenize`` and restore split "&rsquo;" entities.

    Every ``unicode`` item in the tokenizer's result has the sequence
    "&rsquo ;" replaced by the right single quotation mark; items of any
    other type are kept exactly as they are.

    Returns a new list with the processed items.
    """
    restored = []
    for sentence in _en_tokenize(s, punctuation, abbreviations, replace):
        # Only unicode strings are touched; byte strings pass through untouched.
        if isinstance(sentence, unicode):
            sentence = sentence.replace("&rsquo ;", u"’")
        restored.append(sentence)
    return restored