def process_sentence(text, objects='False'):
    '''Quick-and-dirty text preprocessing: fix some misspelled words and lemmatize.'''
    text = text.lower()
    old_text = text
    # normalise digits, common misspellings and synonyms
    text = text.replace('1', 'one').replace('2', 'two').replace(
        '3', 'three').replace('4', 'four').replace('5', 'five').replace('6', 'six').replace(
        '.', '').replace('contains', 'contain').replace(
        'which', '').replace('are there', 'there are').replace(
        'there is', '').replace('ablue', 'a blue').replace(
        'corner', 'edge').replace('wall', 'edge').replace('yelow', 'yellow').replace(
        'below', 'beneath').replace(
        'brick', 'block').replace('leats', 'least').replace('is touching', 'touching')
    text = re.sub(r'colour([\W])', 'color ', text)
    text = re.sub(r'colored([\W])', 'color ', text)
    text = re.sub(r'coloured([\W])', 'color ', text)
    # spell-correct and lemmatize token by token
    text = text.split(' ')
    text = map(correction, [t for t in text if t])
    text = [lemmatizer.lemmatize(x) if x not in [u'as', u'wall'] else x for x in text]
    text = ' '.join(text)
    if 'that' in text:
        text = text.replace('that', '')
    if 'contain' in text or 'ha ' in text:
        text = text.replace('contain', 'with').replace('ha ', 'with ')
    text = re.sub(r'(^|\W)a([\W])', ' one ', text)
    text = re.sub(r'(^)ll ', ' ', text)
    text = re.sub(r'(^)t ', 'at ', text)
    text = ' '.join([t for t in text.split(' ') if t])
    text = text.replace('based', 'base')
    return text
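# Setup sketch (an assumption, not from the original file): process_sentence relies on `re`
# plus module-level `correction` and `lemmatizer`. A hypothetical minimal version, with a
# no-op stand-in for the spell-corrector:
import re
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def correction(word):
    # placeholder; the original presumably uses a real spell-correction function
    return word

# process_sentence('there is a blue brick touching the wall')
# -> 'one blue block touching the edge' (with the identity corrector above)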
def preprocessing():
    PATH = '../input/text-classification-20/'
    punct = ",./-'?!#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’'
    dataset_length = 5331
    neg = pd.DataFrame(index=range(dataset_length), columns=['label', 'text'])
    infile = open(PATH + 'rt-polarity.neg', encoding='utf-8', errors='ignore')
    textlines = infile.readlines()
    for i in range(dataset_length):
        text = textlines[i].strip('\n').strip()
        for p in punct:
            text = text.replace(p, ' ')
        neg.loc[i, 'text'] = text
        neg.loc[i, 'label'] = 0
    pos = pd.DataFrame(index=range(dataset_length), columns=['label', 'text'])
    infile = open(PATH + 'rt-polarity.pos', encoding='utf-8', errors='ignore')
    textlines = infile.readlines()
    for i in range(dataset_length):
        text = textlines[i].strip('\n').strip()
        for p in punct:
            text = text.replace(p, ' ')
        pos.loc[i, 'text'] = text
        pos.loc[i, 'label'] = 1
    train_df = pd.concat([neg, pos], ignore_index=True)
    train_df.to_csv('Sentiment_dataset.csv', index=False, encoding='utf-8')
    x_train = train_df['text'].astype(str)
    y_train = train_df['label'].values.astype(int)
    # print(np.isnan(x_train).any())
    return x_train, y_train
def subchar(text):
    text = text.replace("á", "a")
    text = text.replace("ó", "o")
    text = text.replace("é", "e")
    text = text.replace("í", "i")
    text = text.replace("ú", "u")
    return text
def preprocess(text):
    text = text.lower()
    list_happen = [
        "😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚", "😋", '😜',
        "😝", "🤗", ":)", ":}", "^^", ";)", "👌", "=))", "😅", "👍", "👍🏻", "💕",
        "❤", "👏", "💟", "<3", ":D", ":P", "^_^", "😉", "✌️"
    ]
    list_sad = [
        "😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔", "🤕", "🤒",
        "👿", "🤬", "😤", '😫', "😩", "😭", ":(", "😈", "-_-", "👎"
    ]
    for happen in list_happen:
        text = text.replace(happen, " vui")
    for sad in list_sad:
        text = text.replace(sad, " buồn")
    text = re.sub(
        '[\n!,.?@#?!.,#$%\()*+-/:;<=>@[\\]^_`{|}~`"""“”’∞θ÷α•−β∅³π‘₹´°£€\×™√²—–&]',
        '', text)
    # text = preprocess1(text)
    text = ViTokenizer.tokenize(text)
    # emoticons = re.findall(r"(?:|;|=)(?:-)?(?:\)\(|D|P)", text)
    # text = re.sub(r"[\W]+", " ", text.lower()) + " ".join(emoticons).replace('-', '')
    # text = re.sub("\n", ' ', text)
    return text
def preprocessFastText(text): text = text.replace("' ", " ' ") signs = set(';:,.?!\'“”‘’\"') prods = set(text) & signs # text 기호들 추출 if not prods: return text # 기호 없는 text return for sign in prods: text = text.replace(sign, ' {} '.format(sign) ) #4) return text
def preprocessFastText(text): text = text.replace("' ", " ' ") signs = set(',.:;"?!') prods = set(text) & signs if not prods: return text for sign in prods: text = text.replace(sign, ' {} '.format(sign) ) return text
def clean_preprocess(text, punct=punct, mapping=punct_mapping, mispell=mispell_dict):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, ' ')
    # other special characters that still need handling at the end
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for s in specials:
        text = text.replace(s, specials[s])
    for word in mispell.keys():
        text = text.replace(word, mispell[word])
    return text
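# Assumed module-level defaults for clean_preprocess (illustrative only — the original
# notebook defines its own punct / punct_mapping / mispell_dict before the function,
# since default arguments are bound at definition time):
punct = "/-'?!.,#$%()*+:;<=>@[]^_`{|}~" + '“”'
punct_mapping = {"‘": "'", "’": "'", "´": "'", "`": "'"}
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite'}

# clean_preprocess("my favourite colour!") -> 'my favorite color '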
def preprocess(text):
    # text = re.sub('[\n!,.?@#]', '', text)
    text = text.lower()
    list_happen = [
        "😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚", "😋", '😜',
        "😝", "🤗", ":)", ":}", "^^", ";)", "👌", "=))", "😅", "👍", "👍🏻", "💕",
        "❤", "👏", "💟", "<3", ":D", ":P", "^_^", "😉", "✌️"
    ]
    list_sad = [
        "😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔", "🤕", "🤒",
        "👿", "🤬", "😤", '😫', "😩", "😭", ":(", "😈", "-_-", "👎"
    ]
    for happen in list_happen:
        text = text.replace(happen, "vui")
    for sad in list_sad:
        text = text.replace(sad, "tệ")
    # text = ViTokenizer.tokenize(text)
    return text
def clean_special_chars(text, punct, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, punctuations, punctuation mapping
    output: cleaned text
    '''
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ')
    return text
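# Illustrative call for clean_special_chars; this punct string and mapping are assumptions
# (the kernels credited above define much larger versions):
example_punct = "?!.,#$%&*+:;<=>@"
example_mapping = {'‘': "'", '’': "'", '“': '"', '”': '"'}
# clean_special_chars("it’s great!", example_punct, example_mapping) -> "it's great ! "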
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ')  # f-string, so the punctuation mark itself is kept
    # other special characters that still need handling at the end
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for s in specials:
        text = text.replace(s, specials[s])
    return text
def preprocess(text):
    s_mapping = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
        "could've": "could have", "couldn't": "could not", "didn't": "did not",
        "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
        "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
        "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
        "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
        "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have",
        "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
        "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam",
        "mayn't": "may not", "might've": "might have", "mightn't": "might not",
        "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
        "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
        "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
        "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
        "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
        "she'll've": "she will have", "she's": "she is", "should've": "should have",
        "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
        "so's": "so as", "this's": "this is", "that'd": "that would",
        "that'd've": "that would have", "that's": "that is", "there'd": "there would",
        "there'd've": "there would have", "there's": "there is", "here's": "here is",
        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
        "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
        "what'll've": "what will have", "what're": "what are", "what's": "what is",
        "what've": "what have", "when's": "when is", "when've": "when have",
        "where'd": "where did", "where's": "where is", "where've": "where have",
        "who'll": "who will", "who'll've": "who will have", "who's": "who is",
        "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
        "won't": "will not", "won't've": "will not have", "would've": "would have",
        "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
        "y'all'd": "you all would", "y'all'd've": "you all would have",
        "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
        "you're": "you are", "you've": "you have"
    }
    specials = ["’", "‘", "´", "`"]
    p_mapping = {"_": " ", "`": " "}
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([s_mapping[t] if t in s_mapping else t for t in text.split(" ")])
    for p in p_mapping:
        text = text.replace(p, p_mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ')
    return text
def tokenize(text):
    # remove urls
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    # normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # tokenize
    tokens = word_tokenize(text)
    # remove stop words
    tokens = [w for w in tokens if w not in stopwords.words("english")]
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)
    return ' '.join(clean_tokens)
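# Setup sketch for tokenize (assumptions, not from the original file): it relies on a
# module-level url_regex plus NLTK resources, e.g.:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# nltk.download('punkt'), nltk.download('stopwords') and nltk.download('wordnet') may be required.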
def preprocess_text(text):
    tokens = TOKENIZER.tokenize(text.replace("'", " ' "))
    # drop stop words, single characters and tokens made only of digits/punctuation
    return ' '.join([
        tok for tok in tokens
        if tok not in ENGLISH_STOP_WORDS and len(tok) > 1
        and not all([char.isdigit() or char in ",.:-[]'`" for char in tok])
    ])
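# preprocess_text assumes module-level TOKENIZER and ENGLISH_STOP_WORDS; one plausible
# (assumed) setup:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

TOKENIZER = TweetTokenizer()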
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join(
        [mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
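# Illustrative usage of clean_contractions with a tiny mapping (an assumption; the
# notebooks it comes from pass a full contraction dictionary like s_mapping above):
small_map = {"don't": "do not", "it's": "it is"}
# clean_contractions("it’s fine, don't worry", small_map) -> 'it is fine, do not worry'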
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    wiki_reg = r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg = r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    ip_reg = '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    WIKI_LINK = ' WIKI_LINK '
    URL_LINK = ' URL_LINK '
    IP_LINK = ' IP_LINK '
    # clear links: replace each kind of link with its own placeholder
    c = re.findall(wiki_reg, text)
    for u in c:
        text = text.replace(u, WIKI_LINK)
    c = re.findall(url_reg, text)
    for u in c:
        text = text.replace(u, URL_LINK)
    c = re.findall(ip_reg, text)
    for u in c:
        text = text.replace(u, IP_LINK)
    # Regex to remove all Non-Alpha Numeric and space
    special_character_removal = re.compile(r'[^A-Za-z\d!?*\' ]', re.IGNORECASE)
    # regex to replace all numerics
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    # text = text.lower().split()
    text = text.split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    text = " ".join(text)
    # Remove Special Characters
    text = special_character_removal.sub('', text)
    # Replace Numbers
    text = replace_numbers.sub('NUMBERREPLACER', text)
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # Return the processed text
    return (text)
def text_to_data(self, text, author):
    # remove newlines and numbers, then split into sentences
    text = text.replace('\n', " ")
    text = re.sub(r'[0-9]+', '', text)
    sent_tokenize_list = sent_tokenize(text)
    total_arr = [(x, author) for x in sent_tokenize_list]
    vocab_count = len(set(text.split(' ')))
    return total_arr, vocab_count
def clean_text(text):
    text = remove_urls(text)
    text = remove_users(text.replace('@ ', '@'))
    # text = remove_hash_tags(text)
    text = re.sub(r'\s+', ' ', text).strip()
    # keep printable characters only (filter() returns an iterator in Python 3, so re-join)
    text = ''.join(filter(lambda x: x in printable, text))
    text = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()
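# clean_text assumes a module-level `printable` (for example the ASCII set below) and uses
# the remove_urls / remove_users helpers defined later in this file:
import string
printable = set(string.printable)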
def replace_num(text):
    import re
    p1 = r"[0-9]+"  # regex that matches runs of digits
    pattern1 = re.compile(p1)  # compile the regex
    res = pattern1.findall(text)
    res.sort(key=lambda i: len(i), reverse=True)  # remove longer matches first
    if len(res) > 0:  # if any digit runs were found
        for i in range(len(res)):
            new_text = text.replace(res[i], '')  # strip the matched digits
            text = new_text
    return text
def clean_contractions(text, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, contraction mappings
    output: text with contractions expanded to their base form via the mapping
    '''
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
def clean_text(text):
    # clean latex maths
    text = p.sub(' [ math ] ', text)
    # clean invisible chars
    text = p_space.sub(r'', text)
    # clean punctuations
    for punct in punct_mapping:
        if punct in text:
            text = text.replace(punct, punct_mapping[punct])
    tokens = []
    for token in text.split():
        # replace contractions & correct misspells
        token = mispell_dict.get(token.lower(), token)
        tokens.append(token)
    text = ' '.join(tokens)
    return text
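# clean_text above references module-level patterns p / p_space (plus punct_mapping /
# mispell_dict, similar in shape to the examples earlier in this file); the patterns below
# are assumptions for illustration only:
import re
p = re.compile(r'\[math\].*?\[/math\]', re.DOTALL)          # hypothetical LaTeX-math span pattern
p_space = re.compile(r'[\u00a0\u200b\u200e\u200f\u2009]')   # invisible / zero-width characters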
def clean_text(text):
    # fix apostrophes
    text = text.replace("’", "'")
    # to lower
    text = text.lower()
    # remove \n
    text = re.sub("\\n", "", text)
    # remove leaky elements like ip, user
    text = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", text)
    # split the sentences into words
    words = tokenizer.tokenize(text)
    # apostrophe replacement, e.g. you're --> you are
    # (basic dictionary lookup: master dictionary present in a hidden block of code)
    words = [fill[word] if word in fill else word for word in words]
    # words = [lem.lemmatize(word, "v") for word in words]
    # words = [i for i in text.split() if i not in eng_stopwords]
    text = " ".join(words)
    return text
def preprocessing(path, label):
    # path = './20_newsgroups/alt.atheism/'  # hard-coded override of the `path` argument, left disabled
    files = os.listdir(path)
    punct = "/-'?!#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&' + '\n\r\t'
    columns = ["label", "text"]
    dataset = pd.DataFrame(index=range(1000), columns=columns)
    i = 0
    for file in files:  # iterate over the entries in the folder
        if not os.path.isdir(file):  # only open entries that are not directories
            f = os.path.basename(file)
            print(f)  # print the filename
            paths = path + f
            infile = open(paths, encoding='ANSI', errors='ignore')
            text = infile.read()
            text = text.split('\n\n', maxsplit=1)[1]  # drop the header block before the first blank line
            for p in punct:
                text = text.replace(p, ' ')
            text = re.sub('\s+', ' ', text)
            dataset.loc[i, 'text'] = text
            dataset.loc[i, 'label'] = label
            i += 1
    return dataset
def remove_useless_symbols(text):
    symbols = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&' + '\n'
    for symbol in symbols:
        text = text.replace(symbol, ' ')
    return text
def remove_hash_tags(text):
    hts = [part[1:] for part in text.split() if part.startswith('#')]
    for ht in hts:
        text = text.replace(ht, '')
    return text
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text
def remove_urls(text):
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    for u in urls:
        text = text.replace(u, '')
    return text
def remove_users(text):
    users = re.findall("@([a-z0-9_]+)", text, re.I)
    for u in users:
        text = text.replace('@' + u, '')
    return text
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ')
    return text
def text_parse(text, remove_stopwords=False, stem_words=False):
    wiki_reg = r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg = r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg2 = r'www.[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    ip_reg = '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    WIKI_LINK = ' WIKILINKREPLACER '
    URL_LINK = ' URLLINKREPLACER '
    IP_LINK = ' IPLINKREPLACER '
    # replace endline with '. '
    endline = re.compile(r'.?\n', re.IGNORECASE)
    text = endline.sub('. ', text)
    # clear links
    c = re.findall(wiki_reg, text)
    for u in c:
        text = text.replace(u, WIKI_LINK)
    c = re.findall(url_reg, text)
    for u in c:
        text = text.replace(u, URL_LINK)
    c = re.findall(url_reg2, text)
    for u in c:
        text = text.replace(u, URL_LINK)
    c = re.findall(ip_reg, text)
    for u in c:
        text = text.replace(u, IP_LINK)
    bad_word_dict = get_bad_word_dict()
    # Regex to remove all Non-Alpha Numeric and space
    special_character_removal = re.compile(r'[^A-Za-z\d!?*\'.,; ]', re.IGNORECASE)
    # regex to replace all numerics
    replace_numbers = re.compile(r'\b\d+\b', re.IGNORECASE)
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    text = " ".join(text)
    # Remove Special Characters
    text = special_character_removal.sub(' ', text)
    for k, v in bad_word_dict.items():
        # bad_reg = re.compile('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]' + re.escape(k) + '[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]')
        bad_reg = re.compile('[\W]?' + re.escape(k) + '[\W]|[\W]' + re.escape(k) + '[\W]?')
        text = bad_reg.sub(' ' + v + ' ', text)
        '''
        bad_reg = re.compile('[\W]' + re.escape(k) + '[\W]?')
        text = bad_reg.sub(' ' + v, text)
        bad_reg = re.compile('[\W]?' + re.escape(k) + '[\W]')
        text = bad_reg.sub(v + ' ', text)
        '''
    # Replace Numbers
    text = replace_numbers.sub('NUMBERREPLACER', text)
    text = text.split()
    text = " ".join(text)
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # rake parsing
    text = rake_parse(text)
    return text
def clean_punct(text):
    text = str(text)
    for punct in PUNCTS:
        text = text.replace(punct, ' {} '.format(punct))
    return text
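# clean_punct depends on a module-level PUNCTS iterable; an assumed minimal version, which
# pads every listed punctuation mark with surrounding spaces:
PUNCTS = ['?', '!', '.', ',', '"', '#', '$', '%', '&', '(', ')', '*', '+']
# clean_punct("wait, what?") pads the ',' and '?' so each becomes a separate token.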