def clean_twitter(phrase):
    """Yield cleaned-up tokens (and token windows) from a tweet.

    Non-ASCII characters are replaced with spaces, and any character
    repeated three or more times in a row is squeezed down to two.
    If the resulting phrase as a whole is a bad word, nothing is
    yielded.  Hashtags, @-mentions, and URLs are passed through
    untouched; every other token is normalized, with empty strings,
    the 'rt' retweet marker, and stopwords dropped.
    """
    ascii_text = re.sub(r'[^\x00-\x7f]', ' ', phrase)
    phrase = re.sub(r'(.)\1{2,}', r'\1\1', ascii_text)
    if is_bad_word(phrase.lower()):
        return
    tokens = en_nl.tokenize(phrase).split()
    for token in itertools.chain(tokens, english_window(tokens)):
        # Hashtags, mentions, and links are meaningful as-is.
        if token.startswith(('#', '@', 'http:')):
            yield token
            continue
        if not token.strip() or token == 'rt' or en_nl.is_stopword(token):
            continue
        normalized = en_nl.normalize(token).strip('-')
        if normalized.strip():
            yield normalized
def get_sentences(self):
    """Split ``self.text`` into sentences at punctuation tokens.

    Tokenizes the text with ``en_nl``, then groups consecutive
    non-punctuation tokens into sentences; the punctuation tokens
    themselves are discarded.

    Returns:
        list[list[str]]: one list of word tokens per sentence, with
        no empty sentences.
    """
    words = en_nl.tokenize(self.text).split()
    sentences = []
    current_sentence = []
    for word in words:
        if word in PUNCTUATION:
            # Close out the sentence, but never record an empty one
            # (e.g. for consecutive punctuation marks).
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
        else:
            current_sentence.append(word)
    # Fix: guard the trailing append. Previously an empty list was
    # appended whenever the text ended with punctuation (or was
    # empty), inconsistent with the guard inside the loop.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences
def extract_concepts_with_negation(text):
    """Tokenize *text* and hand the tokens to the word-level extractor.

    A thin convenience wrapper: splitting on whitespace after
    ``en_nl.tokenize`` produces the word list that
    ``extract_concepts_from_words`` expects.
    """
    return extract_concepts_from_words(en_nl.tokenize(text).split())