def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Regex to remove everything except alphanumerics and spaces
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
    # Regex to replace all numeric runs
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)

    text = text.split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    # Rejoin unconditionally: the regex cleanup below needs a string
    text = " ".join(text)

    # Remove special characters
    text = special_character_removal.sub('', text)

    # Replace numbers with the placeholder token 'n'
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
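# A minimal usage sketch for text_to_wordlist above. The import lines are an
# assumption about where its module-level names come from (NLTK's stopword
# list must be fetched once via nltk.download('stopwords')).
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Stopwords dropped, specials stripped, digit runs collapsed to 'n';
# should print roughly 'call n n'
print(text_to_wordlist("Call me at 555 1234!", remove_stopwords=True, stem_words=True))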
def stopwords_remover(text):
    text = re.sub(r"'\w+", '', text)  # remove an apostrophe and the characters after it (e.g. 's, 'll)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # remove stopwords
    text = ' '.join(word for word in text.split() if len(word) > 3)  # drop short tokens
    return text
def process_sentence(text, objects=False):
    '''
    Simple and dirty text preprocessing to fix some misspelled words
    and lemmatize.
    '''
    text = text.lower()
    old_text = text
    text = text.replace('1', 'one').replace('2', 'two').replace(
        '3', 'three').replace('4', 'four').replace('5', 'five').replace('6', 'six').replace(
        '.', '').replace('contains', 'contain').replace(
        'which', '').replace('are there', 'there are').replace(
        'there is', '').replace('ablue', 'a blue').replace(
        'corner', 'edge').replace('wall', 'edge').replace('yelow', 'yellow').replace(
        'below', 'beneath').replace(
        'brick', 'block').replace('leats', 'least').replace('is touching', 'touching')
    # Keep the captured delimiter, so 'colour,' becomes 'color,' (the original
    # captured the group but never used it in the replacement)
    text = re.sub(r'colour(\W)', r'color\1', text)
    text = re.sub(r'colored(\W)', r'color\1', text)
    text = re.sub(r'coloured(\W)', r'color\1', text)
    text = text.split(' ')
    text = map(correction, [t for t in text if t])
    text = [lemmatizer.lemmatize(x) if x not in [u'as', u'wall'] else x for x in text]
    text = ' '.join(text)
    if 'that' in text:
        text = text.replace('that', '')
    if 'contain' in text or 'ha ' in text:
        text = text.replace('contain', 'with').replace('ha ', 'with ')
    text = re.sub(r'(^|\W)a(\W)', ' one ', text)
    text = re.sub(r'^ll ', ' ', text)
    text = re.sub(r'^t ', 'at ', text)
    text = ' '.join([t for t in text.split(' ') if t])
    text = text.replace('based', 'base')
    return text
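# process_sentence relies on two module-level helpers the snippet does not
# define: correction (a spelling corrector) and lemmatizer. A hypothetical
# minimal stand-in pair so the function runs; the no-op corrector is an
# assumption, a real one would be e.g. a Norvig-style corrector.
from nltk.stem import WordNetLemmatizer  # needs nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def correction(word):
    # No-op placeholder; swap in an actual spell corrector here.
    return word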
def text_to_wordlist(text, remove_stopwords=False, stem_words=False, comma=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    import re

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text. The hyphen is escaped so the class lists literal
    # characters instead of the range '+'..'<', which would eat digits.
    # Original pattern note: [^A-Za-z0-9^,!.\/'+-=?]
    if not comma:
        text = re.sub(r"[,:/\^.$%#+\-></?=*\\]", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
# Function to conduct all text contraction steps in one pass
def contract(text):
    # Strip punctuation first so tokenization sees clean words (the original
    # lemmatized character-by-character over the raw string)
    text = "".join(char for char in text if char not in string.punctuation)
    # Lemmatize, then stem, each token; wn and ps are assumed to be a
    # WordNetLemmatizer and a PorterStemmer created at module level
    words = [ps.stem(wn.lemmatize(word)) for word in text.split()]
    # Delete stopwords (stopword is an assumed module-level collection)
    return ' '.join(word for word in words if word not in stopword)
def neg_emo(text):
    # Count tokens found in the negative-emotion lexicon neg_e
    words = text.split(" ")
    sc = 0
    for word in words:
        if word.lower() in neg_e:
            sc += 1
    return sc
def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE.sub(' ', text)
    text = BAD_SYMBOLS.sub(' ', text)
    text = re.sub(r"\'s", " ", text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
def cleanText(text):
    # Replace non-ASCII (cp1252) characters with printable equivalents,
    # using HTML-safe characters when possible.
    if text is None:
        return ''
    text = re.sub(r'\x85', '…', text)  # ellipsis
    text = re.sub(r'\x91', '‘', text)  # left single quote
    text = re.sub(r'\x92', '’', text)  # right single quote
    text = re.sub(r'\x93', '“', text)  # left double quote
    text = re.sub(r'\x94', '”', text)  # right double quote
    text = re.sub(r'\x95', '•', text)  # bullet
    text = re.sub(r'\x96', '-', text)  # en dash
    text = re.sub(r'\x99', '™', text)  # trademark sign
    text = re.sub(r'\xae', '®', text)  # registered sign
    text = re.sub(r'\xb0', '°', text)  # degree sign
    text = re.sub(r'\xba', '°', text)  # masculine ordinal, treated as degree sign

    # Collapse newlines, carriage returns and tabs; keep them instead if you
    # need the line structure for readability.
    text = re.sub(r'[\n\r\t]+', ' ', text)

    # Remove numbers
    text = re.sub(r" \d+", " ", text)

    # Hard-core line that strips every remaining control/high byte.
    text = re.sub(r'[\x00-\x1f\x80-\xff]', ' ', text)

    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text
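# Quick check that cleanText's cp1252 repairs fire: \x99 and \x85 are the
# trademark sign and ellipsis as they leak out of Windows-encoded text.
print(cleanText('Trademark\x99 news\x85'))  # -> 'Trademark™ news…'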
import math

def get_high_frequency_word(textlist):
    # Count word frequencies across all texts
    dic = {}
    for text in textlist:
        for word in text.split(" "):
            if word == "":  # split(" ") leaves empty strings; skip them
                continue
            dic[word] = dic.get(word, 0) + 1

    # Sort by frequency, descending
    sorted_dict = sorted(dic.items(), key=lambda x: x[1], reverse=True)

    # Keep words occurring more than 20 times, then trim to the top 70%
    high_frequency_word = [word for word, count in sorted_dict if count > 20]
    cutoff = math.ceil(len(high_frequency_word) * 0.7)
    return high_frequency_word[:cutoff]
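# The hand-rolled counting above is equivalent to collections.Counter; the
# same selection logic as a sketch, with the thresholds lifted into
# (hypothetical) keyword arguments:
from collections import Counter
import math

def get_high_frequency_word_counter(textlist, min_count=20, keep_ratio=0.7):
    counts = Counter(w for text in textlist for w in text.split(" ") if w)
    frequent = [w for w, c in counts.most_common() if c > min_count]
    return frequent[:math.ceil(len(frequent) * keep_ratio)]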
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of POS-tagged words.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).

    # Returns
        A list of (token, POS tag) pairs.
    """
    if lower:
        text = text.lower()
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    seq = text.split()
    return nltk.pos_tag(seq)
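# Unlike the Keras helper this is adapted from, the function above returns
# POS-tagged pairs. A demo under the assumption that maketrans is the Python 3
# builtin and NLTK's tagger data is installed
# (nltk.download('averaged_perceptron_tagger')):
import sys
import nltk
maketrans = str.maketrans  # Python 3 shim for the name the function expects

print(text_to_word_sequence("The cat sat!"))
# -> something like [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]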
def pos_emo(text):
    # Count tokens found in the positive-emotion lexicon pos_e
    # (the original assigned sc = 1, which capped the score at one match;
    # fixed to count like its sibling neg_emo)
    words = text.split(" ")
    sc = 0
    for word in words:
        if word.lower() in pos_e:
            sc += 1
    return sc
def statistics_unique_words(text):
    # Number of distinct whitespace-separated tokens
    return len(set(text.split()))
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
def swear_number(text):
    # Count tokens found in the swear-word lexicon sl
    # (the original had sc = +1, a typo that reset the count to one)
    words = text.split(" ")
    sc = 0
    for word in words:
        if word.lower() in sl:
            sc += 1
    return sc
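# neg_emo, pos_emo and swear_number share one pattern: count tokens found in a
# module-level lexicon. Tiny illustrative lexicons (hypothetical; the real
# sets neg_e, pos_e and sl are loaded elsewhere):
neg_e = {"sad", "awful"}
pos_e = {"great", "happy"}
sl = {"darn"}

print(neg_emo("so sad and awful"))  # -> 2
print(pos_emo("what a great day"))  # -> 1
print(swear_number("darn it"))      # -> 1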
def apply_cleaning_function(self, fn, texts, description=""): result = [fn(text) for text in texts] sentences = [text.split() for text in result] tf_dict = self.build_tf_dict(sentences) oov = self.check_coverage(tf_dict) # print(oov[:10]) return result
def text_to_data(self, text, author):
    # Remove newlines and numbers, then sentence-tokenize
    text = text.replace('\n', " ")
    text = re.sub(r'[0-9]+', '', text)
    sent_tokenize_list = sent_tokenize(text)
    total_arr = [(x, author) for x in sent_tokenize_list]
    vocab_count = len(set(text.split(' ')))
    return total_arr, vocab_count
def percent(text):
    # Fraction of tokens that are stopwords
    count = 0
    text = "".join(c for c in text if c not in ('!', '.', ':', '?', ';'))
    words = text.split()
    for word in words:
        if word.lower() in stop_words:
            count += 1
    # Guard against empty input to avoid ZeroDivisionError
    return count / len(words) if words else 0.0
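# percent returns the stopword fraction of a text; with NLTK's English list
# bound to the module-level name the function expects:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

print(percent("To be or not to be!"))  # -> 1.0, every token is a stopword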
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    wiki_reg = r'https?://en\.wikipedia\.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg = r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    ip_reg = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    WIKI_LINK = ' WIKI_LINK '
    URL_LINK = ' URL_LINK '
    IP_LINK = ' IP_LINK '

    # Replace links with placeholder tokens; Wikipedia links go first so the
    # generic URL pattern does not swallow them. (The original swapped the
    # WIKI/URL placeholders and never substituted the IP matches.)
    for u in re.findall(wiki_reg, text):
        text = text.replace(u, WIKI_LINK)
    for u in re.findall(url_reg, text):
        text = text.replace(u, URL_LINK)
    for u in re.findall(ip_reg, text):
        text = text.replace(u, IP_LINK)

    # Regex to remove everything except alphanumerics, !?*' and spaces
    special_character_removal = re.compile(r'[^A-Za-z\d!?*\' ]', re.IGNORECASE)
    # Regex to replace all numeric runs
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)

    text = text.split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Remove special characters
    text = special_character_removal.sub('', text)

    # Replace numbers with a placeholder token
    text = replace_numbers.sub('NUMBERREPLACER', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9']", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" uk ", " england ", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iphone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"iii", "3", text)
    text = re.sub(r"the us", "america", text)
    text = re.sub(r" j k ", " jk ", text)

    # Optionally, remove stop words (the original accepted this flag but never
    # acted on it; this block mirrors the sibling implementations above)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()
    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation. str.translate needs a translation table; the
        # original passed string.punctuation directly, which does not work.
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.lower()  # Convert words to lower case

        # Clean the text: drop characters outside A-Za-z0-9^,!./'+-=
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t\.co/[A-Za-z]{10}", " ", text)

        # Remove stopwords once before stemming...
        text = [word for word in text.split() if word not in stoplist]
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        # ...and again afterwards, since stemming can produce new stopwords
        cleanwordlist = [word for word in stemmed_words if word not in stoplist]
        cleaned_text_list.append(" ".join(cleanwordlist))
    return cleaned_text_list
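# The str.maketrans fix above matters: passing string.punctuation directly to
# str.translate (as the original did) builds a nonsense codepoint table
# instead of deleting punctuation. The correct idiom in isolation:
import string

table = str.maketrans('', '', string.punctuation)
print("don't, stop!".translate(table))  # -> 'dont stop'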
def negations(text):
    # Replace negated words with their antonyms; AntonymReplacer is assumed
    # to be defined elsewhere (nltk-cookbook style)
    replacer = AntonymReplacer()
    sent = text.split()
    noneg = replacer.replace_negations(sent)
    return ' '.join(noneg)
def bigram_text_to_word_sequence(text, bigram, filters=base_filter(), lower=False, split=" "):
    '''filters: sequence of characters to filter out'''
    if lower:
        text = text.lower()
    # string.maketrans is Python 2 only; on Python 3 use str.maketrans instead
    text = text.translate(string.maketrans(filters, split * len(filters)))
    seq = text.split(split)
    words = [w for w in seq if w]
    return bigram(words)
def inputpreprocess(text):
    # Drop stopword and punctuation tokens (russian_stop and punctuations are
    # assumed module-level collections)
    t = ' '.join([
        t for t in text.split()
        if t not in russian_stop and t not in punctuations and t != '\n' and t != " "
    ])
    # Then drop any remaining newline and double-quote characters
    t = [s for s in t if s != "\n" and s != '"']
    text = "".join(t)
    text = re.sub(r"^\s+", "", text)  # strip leading whitespace
    return text
def remove_stopwords(text):
    stop = set(stopwords.words('english'))
    punctuation = list(string.punctuation)
    stop.update(punctuation)

    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)
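# Usage note for remove_stopwords: punctuation is dropped only when it stands
# alone as a token, since .strip() removes whitespace, not attached commas:
print(remove_stopwords("The quick brown fox , it jumped!"))
# -> 'quick brown fox jumped!'  (standalone ',' and the stopwords go;
#     the '!' glued to 'jumped' stays)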
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    # Guard the unicode check so this also runs on Python 3, where the
    # name unicode does not exist (mirrors the sibling version above)
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_table = {ord(c): ord(t) for c, t in zip(filters, split * len(filters))}
    else:
        translate_table = maketrans(filters, split * len(filters))
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols with a space
    # text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols matched by BAD_SYMBOLS_RE
    text = ' '.join(word for word in text.split() if word not in stopwords)  # delete stopwords
    return text
def removeStopWords(sen):
    # A-Z, not A-z: the original range accidentally kept [ \ ] ^ _ ` via the
    # ASCII characters between 'Z' and 'a'
    text = re.sub('[^a-zA-Z&]', ' ', sen)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    stops = set(stopwords.words('english'))  # build once, not per token
    text = [ps.stem(word) for word in text if word not in stops]
    return ' '.join(text)
def clean_contractions(text, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, contraction mappings
    output: the text with contractions expanded to their base forms
    '''
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
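# clean_contractions splits on single spaces and matches case-sensitively, so
# the mapping must carry the exact surface forms. Illustrative subset (the
# real mapping is supplied by the caller):
contraction_mapping = {"don't": "do not", "it's": "it is"}

print(clean_contractions("it’s fine, don’t worry", contraction_mapping))
# -> 'it is fine, do not worry'  (curly apostrophes normalized first)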
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    # Rejoin unconditionally: the regex cleanup below needs a string
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e ?-? ?mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
def clean_special_chars(text, punct, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, punctuations, punctuation mapping
    output: cleaned text
    '''
    for p in punct:
        text = text.replace(p, " ")
    text = ' '.join(text.split())
    for p in mapping:
        text = text.replace(p, mapping[p])
    return text
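# Usage sketch for clean_special_chars with an illustrative punct string and
# symbol mapping (both are normally supplied by the caller):
punct = "#@"
mapping = {"€": "euro", "™": "tm"}

print(clean_special_chars("price: 5€ #sale™", punct, mapping))
# -> 'price: 5euro saletm'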