def english_window(words, wsize=2):
    """Yield normalized multi-word concepts from sliding windows over *words*.

    Each window of ``wsize`` consecutive words is joined into a phrase and
    normalized with ``en_nl.normalize``.  The phrase is yielded (lowercased)
    when it looks like a proper-noun phrase — every word capitalized, but not
    the whole phrase in ALL CAPS, and containing no @mentions — or when its
    normalized form is already present in ``concepts``.

    :param words: iterable of word strings (punctuation is stripped here).
    :param wsize: window width in words (default 2).
    """
    # Strip disallowed characters and drop words that become empty.
    # A real list (not `filter(None, ...)`) is required: we need len() and
    # slicing below, and on Python 3 `filter` returns a one-shot iterator.
    words = [w for w in (re.sub(r"[^A-Za-z0-9' -]", '', w) for w in words) if w]
    # `range` instead of `xrange` keeps this portable to Python 3.
    for start in range(len(words) - wsize + 1):
        phrase = ' '.join(words[start:start + wsize])
        # First letter of each word in the window, e.g. "New York" -> "NY".
        initials = ''.join(w[0] for w in phrase.split())
        norm = en_nl.normalize(phrase)
        # Keep the window if it looks like a capitalized proper-noun phrase
        # (all initials uppercase, phrase itself not shouting-caps, no '@')
        # or if it is already a known concept.
        if norm and (('@' not in initials
                      and initials.upper() == initials
                      and phrase.upper() != phrase)
                     or norm in concepts):
            yield norm.lower()
def english_window(words, wsize=2):
    # NOTE(review): this is a byte-for-byte duplicate of the english_window
    # definition directly above it in this file; the second definition shadows
    # the first.  One of the two should be deleted.
    """Yield normalized multi-word concepts from sliding windows over *words*.

    Each window of ``wsize`` consecutive words is joined into a phrase and
    normalized with ``en_nl.normalize``.  The phrase is yielded (lowercased)
    when it looks like a proper-noun phrase — every word capitalized, but not
    the whole phrase in ALL CAPS, and containing no @mentions — or when its
    normalized form is already present in ``concepts``.

    :param words: iterable of word strings (punctuation is stripped here).
    :param wsize: window width in words (default 2).
    """
    # Strip disallowed characters and drop words that become empty.
    # A real list (not `filter(None, ...)`) is required: we need len() and
    # slicing below, and on Python 3 `filter` returns a one-shot iterator.
    words = [w for w in (re.sub(r"[^A-Za-z0-9' -]", '', w) for w in words) if w]
    # `range` instead of `xrange` keeps this portable to Python 3.
    for start in range(len(words) - wsize + 1):
        phrase = ' '.join(words[start:start + wsize])
        # First letter of each word in the window, e.g. "New York" -> "NY".
        initials = ''.join(w[0] for w in phrase.split())
        norm = en_nl.normalize(phrase)
        # Keep the window if it looks like a capitalized proper-noun phrase
        # (all initials uppercase, phrase itself not shouting-caps, no '@')
        # or if it is already a known concept.
        if norm and (('@' not in initials
                      and initials.upper() == initials
                      and phrase.upper() != phrase)
                     or norm in concepts):
            yield norm.lower()
def clean_twitter(phrase):
    """Tokenize a tweet and yield cleaned, normalized terms.

    Non-ASCII bytes are replaced with spaces and runs of three or more
    repeated characters are squeezed to two ("soooo" -> "soo").  If the
    cleaned phrase as a whole is a bad word, nothing is yielded.  Otherwise
    each token — and each window produced by ``english_window`` over the
    tokens — is emitted: hashtags, @mentions and ``http:`` links pass
    through verbatim; stopwords, the retweet marker 'rt', and tokens that
    normalize to nothing are dropped; everything else is yielded in its
    ``en_nl.normalize``d form with surrounding hyphens stripped.
    """
    phrase = re.sub(r'[^\x00-\x7f]', ' ', phrase)
    phrase = re.sub(r'(.)\1{2,}', r'\1\1', phrase)
    if is_bad_word(phrase.lower()):
        return
    tokens = en_nl.tokenize(phrase).split()
    for token in itertools.chain(tokens, english_window(tokens)):
        # Hashtags, mentions and links are kept exactly as written.
        if token.startswith(('#', '@', 'http:')):
            yield token
            continue
        # Guard clauses: skip whitespace-only tokens, retweet markers,
        # and stopwords.
        if not token.strip() or token == 'rt' or en_nl.is_stopword(token):
            continue
        normalized = en_nl.normalize(token).strip('-')
        if normalized.strip():
            yield normalized