from re import compile, sub, DOTALL, IGNORECASE

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NOTE: helpers such as is_tokenized, merge_tokens, word_tokenize,
# sentence_tokenize, pos_tag_text, and the get_*_pattern/get_*_dict
# accessors are assumed to be defined elsewhere in this package.


def remove_numbers(text):
    """Removes numbers from text."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Remove all leading and trailing white spaces.
    normalized_text = normalized_text.strip()
    # Replace all numbers with spaces.
    normalized_text = sub(get_number_pattern(), r' ', normalized_text)
    # Then collapse multiple adjacent spaces.
    normalized_text = sub(' +', ' ', normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
def convert_case(text, to_lower=True):
    """Converts text to the given case."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # If to_lower, convert to lower case; else, convert to upper case.
    if to_lower:
        normalized_text = normalized_text.lower()
    else:
        normalized_text = normalized_text.upper()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
def remove_end_characters(text):
    """Removes sentence-end characters from text."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Append a space so a trailing end character can still match.
    normalized_text += ' '
    # Replace end characters with spaces.
    normalized_text = sub(get_end_characters_pattern(), r' ', normalized_text)
    # Then collapse multiple adjacent spaces.
    normalized_text = sub(' +', ' ', normalized_text)
    # Then strip text.
    normalized_text = normalized_text.strip()
    # Drop a trailing period, guarding against an empty result.
    if normalized_text and normalized_text[-1] == '.':
        normalized_text = normalized_text[:-1]
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
def expand_abbreviations(text):
    """Expands abbreviations in text."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # If the last character is not a space, add one so the final
    # abbreviation can still match its trailing whitespace.
    if normalized_text and normalized_text[-1] != ' ':
        normalized_text += ' '
    # Creates the abbreviations pattern: each abbreviation, an optional
    # dot, and a whitespace character.
    abbreviations_pattern = compile('({})'.format(r'\.?\s|'.join(
        get_abbreviation_dict().keys()) + r'\.?\s'),
        flags=IGNORECASE | DOTALL)

    def expand_match(abbreviation):
        """Expands a matched abbreviation."""
        # Retrieves the matched abbreviation from the string.
        match = abbreviation.group(0)
        # If the last character is a space, remove it and remember to
        # restore it after expansion.
        if match[-1] == ' ':
            match = match[:-1]
            remove_space = True
        else:
            remove_space = False
        # If the last character is a dot, remove the dot.
        if match[-1] == '.':
            match = match[:-1]
        # Look up the expansion in the dictionary, keyed by the abbreviation.
        expanded_abbreviation = get_abbreviation_dict().get(match.lower())
        # If no expansion was found, keep the original match.
        if not expanded_abbreviation:
            return abbreviation.group(0)
        if remove_space:
            expanded_abbreviation += ' '
        # Return the expanded abbreviation.
        return expanded_abbreviation

    # Replaces abbreviations with their expansions in text.
    normalized_text = abbreviations_pattern.sub(expand_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return expanded text.
    return normalized_text
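# NOTE: get_abbreviation_dict is defined elsewhere in this package. The
# lookup above assumes it maps lower-cased abbreviations (without a trailing
# dot) to their expansions; an illustrative shape, not the real mapping:
#
#     {'etc': 'et cetera', 'dr': 'doctor', 'approx': 'approximately'}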
def clean_text(text, wordtokenize=False):
    """Runs the full cleaning pipeline; returns each step and the result."""
    clean_dict = {}
    # Replace multiple whitespaces.
    clean_dict['replace_whitespaces'] = replace_whitespaces(text)
    # Replace multiple stopwords.
    clean_dict['replace_multiple_stopwords'] = replace_multiple_stopwords(
        clean_dict['replace_whitespaces'])
    # Replace apostrophes.
    clean_dict['replace_apostrophes'] = replace_apostrophes(
        clean_dict['replace_multiple_stopwords'])
    # Expand contractions.
    clean_dict['expand_contractions'] = expand_contractions(
        clean_dict['replace_apostrophes'])
    # Remove hyperlinks.
    clean_dict['remove_hyperlinks'] = remove_hyperlinks(
        clean_dict['expand_contractions'])
    # Remove special characters.
    clean_dict['remove_special_characters'] = remove_special_characters(
        clean_dict['remove_hyperlinks'])
    # Remove numbers.
    clean_dict['remove_numbers'] = remove_numbers(
        clean_dict['remove_special_characters'])
    # Convert to lower case.
    clean_dict['convert_case'] = convert_case(clean_dict['remove_numbers'])
    # Expand abbreviations.
    clean_dict['expand_abbreviations'] = expand_abbreviations(
        clean_dict['convert_case'])
    # Tokenize sentences.
    temp_sentence = correct_pontuation(clean_dict['expand_abbreviations'])
    temp_sentence = replace_whitespaces(temp_sentence)
    clean_dict['sentence_tokenize'] = sentence_tokenize(temp_sentence)
    # If sentence tokenize is empty, return None.
    if not clean_dict['sentence_tokenize']:
        return clean_dict, None
    # Remove end characters.
    clean_dict['remove_end_characters'] = [
        remove_end_characters(item)
        for item in clean_dict['sentence_tokenize'] if len(item) > 1
    ]
    # Lemmatize words.
    clean_dict['lemmatize'] = [
        lemmatize_text(item)
        for item in clean_dict['remove_end_characters'] if len(item) > 1
    ]
    # Remove stopwords.
    clean_dict['remove_stopwords'] = [
        remove_stopwords(item)
        for item in clean_dict['lemmatize'] if len(item) > 1
    ]
    # If word tokenization was not requested, return the cleaned sentences.
    if not wordtokenize:
        return clean_dict, clean_dict['remove_stopwords']
    # Tokenize the words of each sentence.
    clean_dict['sentence_word_tokenize'] = [
        word_tokenize(item, 'whitespace')
        for item in clean_dict['remove_stopwords'] if len(item) > 1
    ]
    # Return dictionary and cleaned text.
    non_tokenized_result = convert_tokens_to_string_of_words(
        clean_dict['sentence_word_tokenize'])
    clean_dict['direct_word_tokenize'] = word_tokenize(
        non_tokenized_result, 'whitespace')
    return clean_dict, non_tokenized_result
def lemmatize_text(text):
    """Lemmatizes text using POS tags."""
    # If text is not tokenized, tokenize it first.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = text
    else:
        was_tokenized = False
        normalized_text = word_tokenize(text, 'whitespace')
    # POS-tag the tokens so the lemmatizer receives each word's class.
    pos_tagged_text = pos_tag_text(normalized_text)
    # Create the lemmatizer once instead of once per token.
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [
        lemmatizer.lemmatize(word, pos_tag) if pos_tag else word
        for word, pos_tag in pos_tagged_text
    ]
    # If text was not tokenized, merge the tokens back into a string.
    if not was_tokenized:
        lemmatized_text = merge_tokens(lemmatized_text)
    return lemmatized_text
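# NOTE: pos_tag_text is defined elsewhere in this package. The sketch below
# is purely illustrative of the contract assumed above: each token maps to a
# (token, wordnet_pos) pair, where the POS is wordnet.NOUN/VERB/ADJ/ADV or
# None for word classes WordNet does not cover. The real helper may differ.
def _example_pos_tag_text(tokens):
    """Illustrative only: POS-tags tokens and maps Penn tags to WordNet."""
    from nltk import pos_tag
    from nltk.corpus import wordnet

    # Map the first letter of the Penn Treebank tag to a WordNet POS.
    tag_map = {'N': wordnet.NOUN, 'V': wordnet.VERB,
               'J': wordnet.ADJ, 'R': wordnet.ADV}
    return [(word, tag_map.get(tag[0])) for word, tag in pos_tag(tokens)]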
def expand_contractions(text):
    """Expands contractions in text."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Creates the contractions pattern.
    contractions_pattern = compile('({})'.format('|'.join(
        get_contraction_dict().keys())), flags=IGNORECASE | DOTALL)

    def expand_match(contraction):
        """Expands a matched contraction."""
        # Retrieves the matched contraction from the string.
        match = contraction.group(0)
        # Stores the first character to preserve its casing.
        first_char = match[0]
        # Look up the expansion in the dictionary, keyed by the contraction.
        expanded_contraction = get_contraction_dict().get(match)
        # If the contraction could not be found, try again in lower case.
        if not expanded_contraction:
            expanded_contraction = get_contraction_dict().get(match.lower())
        # If it is still not found, keep the original match.
        if not expanded_contraction:
            return match
        # Restore the original first character for case sensitivity.
        return first_char + expanded_contraction[1:]

    # Replaces contractions with their expansions in text.
    normalized_text = contractions_pattern.sub(expand_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return expanded text.
    return normalized_text
def replace_apostrophes(text):
    """Replaces the apostrophe pattern with '."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Replaces the apostrophe pattern with '.
    normalized_text = sub(get_apostrophe_pattern(), "'", normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
def replace_multiple_stopwords(text):
    """Replaces runs of end punctuation (., !, ?) with a single mark."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Collapse repeated end punctuation via _get_single_match.
    normalized_text = sub('[.!?]+', _get_single_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
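# NOTE: _get_single_match is defined elsewhere in this module. The sketch
# below only illustrates the contract assumed above: given a regex match over
# a punctuation run, return one representative mark. The real helper may
# pick the mark differently.
def _example_get_single_match(match):
    """Illustrative only: collapses a punctuation run to its first mark."""
    return match.group(0)[0]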
def remove_hyperlinks(text):
    """Removes hyperlinks from text."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Replace hyperlinks with spaces.
    normalized_text = sub(get_hyperlink_pattern(), r' ', normalized_text)
    # Then collapse multiple adjacent spaces.
    normalized_text = sub(' +', ' ', normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
def remove_stopwords(text):
    """Removes stopwords from a word token list."""
    # If text is empty, return None.
    if not text:
        return None
    # If text is not tokenized, tokenize text.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = text
    else:
        was_tokenized = False
        normalized_text = word_tokenize(text, 'whitespace')
    # Create the stopwords set.
    stop_set = set(stopwords.words('english'))
    stop_set.update(['amp'])
    # Filter stopwords from text.
    normalized_text = [
        token for token in normalized_text if token not in stop_set
    ]
    # If text was not tokenized, merge tokens.
    if not was_tokenized:
        normalized_text = merge_tokens(normalized_text)
    # Return normalized text.
    return normalized_text
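# A minimal usage sketch of the pipeline, assuming the NLTK data it relies on
# (stopwords, wordnet, the POS tagger) has been downloaded and the helpers
# referenced above are importable. The sample sentence is illustrative only.
if __name__ == '__main__':
    sample = "Check https://example.com!!! He's got 2 dogs..."
    steps, cleaned = clean_text(sample, wordtokenize=True)
    # Every intermediate step is kept in the dictionary for inspection.
    print(steps['remove_hyperlinks'])
    print(cleaned)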