def tokenize(s, punctuation=PUNCTUATION, abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."], replace={}):
    """Tokenize the given string with Dutch defaults and repair time expressions.

    All arguments are passed on unchanged to _en_tokenize(). Every string it
    yields is then patched so that "' s morgens", "' s middags" and
    "' s avonds" become "'s morgens", "'s middags" and "'s avonds".

    Returns a new list with the patched strings.
    """
    # 's in Dutch preceded by a vowel indicates plural ("auto's"): don't replace.
    # NOTE(review): the list and dict defaults are created once and shared by
    # every call; confirm that _en_tokenize() never mutates them.
    repairs = (
        ("' s morgens", "'s morgens"),
        ("' s middags", "'s middags"),
        ("' s avonds", "'s avonds"),
    )
    patched = []
    for sentence in _en_tokenize(s, punctuation, abbreviations, replace):
        # Apply the repairs in the listed order to each string.
        for split_form, joined_form in repairs:
            sentence = sentence.replace(split_form, joined_form)
        patched.append(sentence)
    return patched
def tokenize(s, punctuation=PUNCTUATION, abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."], replace={}):
    """Run _en_tokenize() with Dutch defaults, then rejoin split "'s" time expressions.

    The four arguments are forwarded positionally to _en_tokenize(). In each
    string of its result, "' s morgens", "' s middags" and "' s avonds" are
    rewritten (in that order) to "'s morgens", "'s middags" and "'s avonds".

    Returns the rewritten strings as a list.
    """
    # 's in Dutch preceded by a vowel indicates plural ("auto's"): don't replace.
    sentences = _en_tokenize(s, punctuation, abbreviations, replace)
    return [
        sentence.replace("' s morgens", "'s morgens")
                .replace("' s middags", "'s middags")
                .replace("' s avonds", "'s avonds")
        for sentence in sentences
    ]
def tokenize(s, punctuation=PUNCTUATION, abbreviations=abbreviations, replace={"'n": " 'n"}):
    """Tokenize the given string with Dutch defaults and repair "'s" time expressions.

    The arguments are forwarded unchanged to _en_tokenize(). In every string
    of its result, "' s" followed by a space and one of "ochtends",
    "morgens", "middags" or "avonds" is rewritten to "'s" plus that word.

    Returns a list with the rewritten strings.
    """
    # 's in Dutch preceded by a vowel indicates plural ("auto's"): don't replace.
    # NOTE(review): the dict default is shared between calls; confirm that
    # _en_tokenize() does not modify it.
    split_time_expression = re.compile(r"' s (ochtends|morgens|middags|avonds)")
    sentences = _en_tokenize(s, punctuation, abbreviations, replace)
    return [split_time_expression.sub("'s \\1", sentence) for sentence in sentences]
def tokenize(s, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace={}):
    """Tokenize the given string by delegating to _en_tokenize().

    This wrapper only supplies module-specific defaults (PUNCTUATION,
    ABBREVIATIONS and an empty replacement table); the four arguments are
    forwarded positionally and the result is returned untouched.
    """
    tokenized = _en_tokenize(s, punctuation, abbreviations, replace)
    return tokenized
def tokenize(s, punctuation=PUNCTUATION, abbreviations=abbreviations, replace={"'n": " 'n"}):
    """Run _en_tokenize() with Dutch defaults and rejoin split "'s" time expressions.

    After delegating to _en_tokenize(), each resulting string has "' s"
    followed by a space and "ochtends", "morgens", "middags" or "avonds"
    rewritten to "'s" plus that word.

    Returns the rewritten strings as a new list.
    """
    # 's in Dutch preceded by a vowel indicates plural ("auto's"): don't replace.
    rejoined = []
    for sentence in _en_tokenize(s, punctuation, abbreviations, replace):
        sentence = re.sub(r"' s (ochtends|morgens|middags|avonds)", "'s \\1", sentence)
        rejoined.append(sentence)
    return rejoined
def tokenize(s, punctuation=PUNCTUATION, abbreviations=["bv.", "blz.", "e.d.", "m.a.w.", "nl."], replace={}):
    """Tokenize the given string by delegating to _en_tokenize() with Dutch defaults.

    The default abbreviations are "bv.", "blz.", "e.d.", "m.a.w." and "nl.",
    and the default replacement table is empty. The four arguments are
    forwarded positionally and the result is returned as-is.
    """
    # 's in Dutch preceded by a vowel indicates plural ("auto's"): don't replace.
    # NOTE(review): the list and dict defaults are shared between calls;
    # confirm that _en_tokenize() leaves them unmodified.
    tokenized = _en_tokenize(s, punctuation, abbreviations, replace)
    return tokenized
def tokenize(s, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements):
    """Tokenize the given string and repair a split "&rsquo;" entity.

    The arguments are forwarded unchanged to _en_tokenize(). In each unicode
    string of its result, the text "&rsquo ;" is replaced by the typographic
    apostrophe character; items that are not unicode instances are kept as
    they are.

    Returns the items as a new list.
    """
    repaired = []
    for sentence in _en_tokenize(s, punctuation, abbreviations, replace):
        # Only unicode strings are patched; anything else passes through.
        if isinstance(sentence, unicode):
            sentence = sentence.replace("&rsquo ;", u"’")
        repaired.append(sentence)
    return repaired