def replace(text, repls):
    """Run the replacements described by ``repls`` over ``text``.

    ``repls`` is handed to ``replacer.prepare`` and the object it returns
    is passed, together with ``text``, to ``replacer.apply``.  Whatever
    ``replacer.apply`` returns is returned unchanged.
    """
    # Prepare and apply in a single expression; no intermediate name needed.
    return replacer.apply(replacer.prepare(repls), text)
''' import re import sys from denis.common import util from denis.common.replacer import replacer _to_remove = [ '.', ',', '!', '?', ':', ';', '>', '<', '"', "'", '(', ')', '{', '}', '[', ']', '\\', '--', '`', ] _to_substitute = util.flatten([_to_remove, [ '-' ]]) _removal_pattern = replacer.prepare(_to_remove, onlyAtEnds=True) _substitution_pattern = replacer.prepare(_to_substitute, onlyAtEnds=False) _digit_normalizers = { r'^[0-9]{1,}(\.[0-9]{1,}){0,1}$': '[DIGITS]', r'^\$[0-9]{1,}(\.[0-9]{1,}){0,1}$': '[MONEY]' } def tokenize(line, clean=True, tolower=True, splitwords=False): tokens = line.strip().split() if clean: cleanTokens = [] for token in tokens: token = token.strip() # only force UTF-8 encoding if still in Python 2 if sys.version[0] == '2':