def english_window(words, wsize=2):
    """Yield normalized multi-word concepts from sliding windows over *words*.

    Each window of ``wsize`` consecutive words is joined into a phrase and
    normalized with ``en_nl.normalize``.  The phrase is yielded (lowercased)
    when it looks like a proper-noun phrase — every word capitalized, but not
    the whole phrase in ALL CAPS, and containing no @mentions — or when its
    normalized form is already present in ``concepts``.

    :param words: iterable of word strings (punctuation is stripped here).
    :param wsize: window width in words (default 2).
    """
    # Strip disallowed characters and drop words that become empty.
    # A real list (not `filter(None, ...)`) is required: we need len() and
    # slicing below, and on Python 3 `filter` returns a one-shot iterator.
    words = [w for w in (re.sub(r"[^A-Za-z0-9' -]", '', w) for w in words) if w]
    # `range` instead of `xrange` keeps this portable to Python 3.
    for start in range(len(words) - wsize + 1):
        phrase = ' '.join(words[start:start + wsize])
        # First letter of each word in the window, e.g. "New York" -> "NY".
        initials = ''.join(w[0] for w in phrase.split())
        norm = en_nl.normalize(phrase)
        # Keep the window if it looks like a capitalized proper-noun phrase
        # (all initials uppercase, phrase itself not shouting-caps, no '@')
        # or if it is already a known concept.
        if norm and (('@' not in initials
                      and initials.upper() == initials
                      and phrase.upper() != phrase)
                     or norm in concepts):
            yield norm.lower()
def english_window(words, wsize=2):
    # NOTE(review): this is a byte-for-byte duplicate of the english_window
    # definition directly above it in this file; the second definition shadows
    # the first.  One of the two should be deleted.
    """Yield normalized multi-word concepts from sliding windows over *words*.

    Each window of ``wsize`` consecutive words is joined into a phrase and
    normalized with ``en_nl.normalize``.  The phrase is yielded (lowercased)
    when it looks like a proper-noun phrase — every word capitalized, but not
    the whole phrase in ALL CAPS, and containing no @mentions — or when its
    normalized form is already present in ``concepts``.

    :param words: iterable of word strings (punctuation is stripped here).
    :param wsize: window width in words (default 2).
    """
    # Strip disallowed characters and drop words that become empty.
    # A real list (not `filter(None, ...)`) is required: we need len() and
    # slicing below, and on Python 3 `filter` returns a one-shot iterator.
    words = [w for w in (re.sub(r"[^A-Za-z0-9' -]", '', w) for w in words) if w]
    # `range` instead of `xrange` keeps this portable to Python 3.
    for start in range(len(words) - wsize + 1):
        phrase = ' '.join(words[start:start + wsize])
        # First letter of each word in the window, e.g. "New York" -> "NY".
        initials = ''.join(w[0] for w in phrase.split())
        norm = en_nl.normalize(phrase)
        # Keep the window if it looks like a capitalized proper-noun phrase
        # (all initials uppercase, phrase itself not shouting-caps, no '@')
        # or if it is already a known concept.
        if norm and (('@' not in initials
                      and initials.upper() == initials
                      and phrase.upper() != phrase)
                     or norm in concepts):
            yield norm.lower()
def clean_twitter(phrase):
    """Tokenize a tweet and yield cleaned, normalized terms.

    Non-ASCII bytes are replaced with spaces and runs of three or more
    repeated characters are squeezed to two ("soooo" -> "soo").  If the
    cleaned phrase as a whole is a bad word, nothing is yielded.  Otherwise
    each token — and each window produced by ``english_window`` over the
    tokens — is emitted: hashtags, @mentions and ``http:`` links pass
    through verbatim; stopwords, the retweet marker 'rt', and tokens that
    normalize to nothing are dropped; everything else is yielded in its
    ``en_nl.normalize``d form with surrounding hyphens stripped.
    """
    phrase = re.sub(r'[^\x00-\x7f]', ' ', phrase)
    phrase = re.sub(r'(.)\1{2,}', r'\1\1', phrase)
    if is_bad_word(phrase.lower()):
        return
    tokens = en_nl.tokenize(phrase).split()
    for token in itertools.chain(tokens, english_window(tokens)):
        # Hashtags, mentions and links are kept exactly as written.
        if token.startswith(('#', '@', 'http:')):
            yield token
            continue
        # Guard clauses: skip whitespace-only tokens, retweet markers,
        # and stopwords.
        if not token.strip() or token == 'rt' or en_nl.is_stopword(token):
            continue
        normalized = en_nl.normalize(token).strip('-')
        if normalized.strip():
            yield normalized