import re


def get_hashtags_and_user_mentions(special_characters, text, wanted_characters=['#', '@']):
    # Identify hashtags and user mentions and strip them out of the text.
    results = {}
    for character in special_characters:
        # Collapse repeated markers and force each marker to start a new token.
        text = re.sub('(' + re.escape(character) + ')+', ' ' + character, text)
        count_character = text.count(character)
        while count_character > 0:
            start = text.find(character)
            # The token ends at the next space or newline, whichever comes
            # first. The original compared raw find() results directly, which
            # breaks when one of them returns -1.
            boundaries = [pos for pos in (text.find(" ", start), text.find("\n", start)) if pos != -1]
            end = min(boundaries) if boundaries else len(text)
            text_to_remove = text[start:end]
            # Keep only tokens longer than the marker plus one character.
            if len(text_to_remove) > 2 and character in wanted_characters:
                results.setdefault(character, []).append(text_to_remove)
            text = text.replace(text_to_remove, "")
            text = ' '.join(text.split())
            count_character = text.count(character)
    for wanted_character in wanted_characters:
        if wanted_character not in results:
            results[wanted_character] = []
    text = ' '.join(text.split()).strip(' ')
    results['clean_text'] = text
    return results
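# A quick illustrative call with a made-up tweet; the sample string and the
# character lists are examples only, not from the original code.
tweet = "Loving the #pasta at @joes_diner today #foodie"
parts = get_hashtags_and_user_mentions(['#', '@'], tweet)
print(parts['#'])           # -> ['#pasta', '#foodie']
print(parts['@'])           # -> ['@joes_diner']
print(parts['clean_text'])  # -> 'Loving the at today'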
import re


def clean_text(self, text):
    """
    # Arguments
        text: text body to be preprocessed and cleaned

    # Return
        cleaned text
    """
    # Handle non-ASCII/special characters: render the text as a bytes
    # literal, strip the \xNN / \uNNNN escape sequences it introduces, then
    # drop the b'...' wrapper. (The original did str(text).replace("b", ""),
    # which deleted every letter 'b' in the text, mangling ordinary words.)
    text = str(text.encode("utf-8"))
    text = re.sub(r"\\[ux][a-z0-9]+", " ", text)
    text = text[2:-1].lower()
    # Collapse punctuation, digits and other special characters to spaces.
    text = re.sub(r'[\:\-\(\)\%\d\.\\\/\_\[\]\+\,\#\"]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    word_list = text.split(' ')  # tokenization w.r.t. space characters
    # Keep words that are not stopwords and are long enough.
    rel_words = [
        word for word in word_list
        if word not in self.stop and len(word) >= self.min_word_len
    ]
    # Lemmatize the relevant words as verbs.
    rel_words_lemm = [
        self.lemmatizer.lemmatize(word, pos='v') for word in rel_words
    ]
    return " ".join(rel_words_lemm)
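# A minimal sketch of the class this method appears to live in. The class
# name, the min_word_len default and the NLTK-backed wiring of self.stop and
# self.lemmatizer are assumptions for illustration, not part of the original
# snippet.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


class Preprocessor:
    # Reuse the module-level function above as the instance method.
    clean_text = clean_text

    def __init__(self, min_word_len=2):
        self.stop = set(stopwords.words('english'))  # assumes NLTK data is downloaded
        self.lemmatizer = WordNetLemmatizer()
        self.min_word_len = min_word_len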
import re


def clean_string(text):
    def pad_str(s):
        return ' ' + s + ' '

    # Empty question
    if type(text) != str or text == '':
        return ''

    # prevent the first and last word being ignored by the regexes below,
    # and convert the first word of the question to lower case
    text = ' ' + text[0].lower() + text[1:] + ' '

    # replace the first char after any of [.!?)"'] with its lowercase form;
    # don't mind if we lower a proper noun, it won't be a big problem
    def lower_first_char(pattern):
        matched_string = pattern.group(0)
        return matched_string[:-1] + matched_string[-1].lower()

    text = re.sub(r"(?<=[\.\?\)\!\'\"])[\s]*.", lower_first_char, text)

    # Replace weird chars in text
    text = re.sub("’", "'", text)   # special single quote
    text = re.sub("`", "'", text)   # special single quote
    text = re.sub("“", '"', text)   # special double quote
    text = re.sub("？", "?", text)  # full-width question mark
    text = re.sub("…", " ", text)
    text = re.sub("é", "e", text)

    # Clean shorthands. We have cases like "Sam is" vs "Sam's" (i.e. his);
    # the two aren't separable, so as a compromise we kill "'s" directly.
    text = re.sub(r"\'s", " ", text)
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub(r"b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub(r"(\W|^)([0-9]+)[kK](\W|$)", r"\1\g<2>000\3", text)  # better regex provided by @armamut
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub(r"(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub(r"(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
    text = re.sub(r"\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"[c-fC-F]\:\/", " disk ", text)

    # replace float numbers with a fixed number; it will be parsed as a
    # number afterward and replaced with the word "number"
    text = re.sub(r'[0-9]+\.[0-9]+', " 87 ", text)

    # remove commas between digits, i.e. 15,000 -> 15000
    text = re.sub(r'(?<=[0-9])\,(?=[0-9])', "", text)

    # # all numbers should be separated from words, but this is too aggressive
    # def pad_number(pattern):
    #     matched_string = pattern.group(0)
    #     return pad_str(matched_string)
    # text = re.sub('[0-9]+', pad_number, text)

    # add padding to punctuation and special chars, we still need them later
    text = re.sub(r'\$', " dollar ", text)
    text = re.sub(r'\%', " percent ", text)
    text = re.sub(r'\&', " and ", text)

    def pad_pattern(pattern):
        matched_string = pattern.group(0)
        return pad_str(matched_string)

    # (the original character class was missing its closing bracket)
    text = re.sub(r'[\!\?\@\^\+\*\/\,\~\|\`\=\:\;\.\#\\]', pad_pattern, text)
    # replace non-ascii words with a special token
    text = re.sub(r'[^\x00-\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text)

    # Indian rupees (rs)
    text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)

    # clean text rules from: https://www.kaggle.com/currie32/the-importance-of-cleaning-text
    text = re.sub(r" (the[\s]+|The[\s]+)?US(A)? ", " America ", text)
    text = re.sub(r" UK ", " England ", text, flags=re.IGNORECASE)
    text = re.sub(r" india ", " India ", text)
    text = re.sub(r" switzerland ", " Switzerland ", text)
    text = re.sub(r" china ", " China ", text)
    text = re.sub(r" chinese ", " Chinese ", text)
    text = re.sub(r" imrovement ", " improvement ", text, flags=re.IGNORECASE)
    text = re.sub(r" intially ", " initially ", text, flags=re.IGNORECASE)
    text = re.sub(r" quora ", " Quora ", text, flags=re.IGNORECASE)
    text = re.sub(r" dms ", " direct messages ", text, flags=re.IGNORECASE)
    text = re.sub(r" demonitization ", " demonetization ", text, flags=re.IGNORECASE)
    text = re.sub(r" actived ", " active ", text, flags=re.IGNORECASE)
    text = re.sub(r" kms ", " kilometers ", text, flags=re.IGNORECASE)
    text = re.sub(r" cs ", " computer science ", text, flags=re.IGNORECASE)
    text = re.sub(r" upvote", " up vote", text, flags=re.IGNORECASE)
    text = re.sub(r" iPhone ", " phone ", text, flags=re.IGNORECASE)
    text = re.sub(r" \0rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(r" calender ", " calendar ", text, flags=re.IGNORECASE)
    text = re.sub(r" ios ", " operating system ", text, flags=re.IGNORECASE)
    text = re.sub(r" gps ", " GPS ", text, flags=re.IGNORECASE)
    text = re.sub(r" gst ", " GST ", text, flags=re.IGNORECASE)
    text = re.sub(r" programing ", " programming ", text, flags=re.IGNORECASE)
    text = re.sub(r" bestfriend ", " best friend ", text, flags=re.IGNORECASE)
    text = re.sub(r" dna ", " DNA ", text, flags=re.IGNORECASE)
    text = re.sub(r" III ", " 3 ", text)
    text = re.sub(r" banglore ", " Banglore ", text, flags=re.IGNORECASE)
    text = re.sub(r" J K ", " JK ", text, flags=re.IGNORECASE)
    text = re.sub(r" J\.K\. ", " JK ", text, flags=re.IGNORECASE)

    # typos identified with my eyes
    text = re.sub(r" quikly ", " quickly ", text)
    text = re.sub(r" unseccessful ", " unsuccessful ", text)
    text = re.sub(r" demoniti[\S]+ ", " demonetization ", text, flags=re.IGNORECASE)
    text = re.sub(r" demoneti[\S]+ ", " demonetization ", text, flags=re.IGNORECASE)
    text = re.sub(r" addmision ", " admission ", text)
    text = re.sub(r" insititute ", " institute ", text)
    text = re.sub(r" connectionn ", " connection ", text)
    text = re.sub(r" permantley ", " permanently ", text)
    text = re.sub(r" sylabus ", " syllabus ", text)
    text = re.sub(r" sequrity ", " security ", text)
    text = re.sub(r" undergraduation ", " undergraduate ", text)  # not a typo, but GloVe can't find it
    text = re.sub(r"(?=[a-zA-Z])ig ", "ing ", text)
    text = re.sub(r" latop", " laptop", text)
    text = re.sub(r" programmning ", " programming ", text)
    text = re.sub(r" begineer ", " beginner ", text)
    text = re.sub(r" qoura ", " Quora ", text)
    text = re.sub(r" wtiter ", " writer ", text)
    text = re.sub(r" litrate ", " literate ", text)

    # For words like A-B-C-D or "A B C D":
    # if A, B, C, D individually have vectors in GloVe:
    #     they can be treated as separate words
    # else:
    #     replace the whole thing with a single special word; A_B_C_D is
    #     enough, we'll deal with that word later
    #
    # Testcase: 'a 3-year-old 4 -tier car'
    def dash_dealer(pattern):
        matched_string = pattern.group(0)
        splited = matched_string.split('-')
        splited = [sp.strip() for sp in splited if sp != ' ' and sp != '']
        joined = ' '.join(splited)
        parsed = nlp(joined)
        for token in parsed:
            # if any token is not a common word, join the parts into one word
            if not token.has_vector or token.text in SPECIAL_TOKENS.values():
                return '_'.join(splited)
        # if all tokens are common words, keep them split
        return joined

    text = re.sub(r"[a-zA-Z0-9\-]*-[a-zA-Z0-9\-]*", dash_dealer, text)

    # Try to see if a string between quotes/brackets is meaningful.
    # Rule:
    # if at least one word is "not a number", "longer than 2 chars" and
    # "identifiable by spaCy":
    #     consider the string meaningful
    # else:
    #     replace the string with a special word, i.e. quoted_item
    # Testcases:
    #     i am a good (programmer)      -> i am a good programmer
    #     i am a good (programmererer)  -> i am a good quoted_item
    #     i am "i am a"                 -> i am quoted_item
    #     i am "i am a programmer"      -> i am i am a programmer
    #     i am "i am a programmererer"  -> i am quoted_item
    def quoted_string_parser(pattern):
        string = pattern.group(0)
        parsed = nlp(string[1:-1])
        is_meaningful = False
        for token in parsed:
            # one meaningful token is enough to keep the full string
            if len(token.text) > 2 and not token.text.isdigit() and token.has_vector:
                is_meaningful = True
            elif token.text in SPECIAL_TOKENS.values():
                is_meaningful = True
        if is_meaningful:
            return string
        else:
            return pad_str(string[0]) + SPECIAL_TOKENS['quoted'] + pad_str(string[-1])

    text = re.sub(r'\".*\"', quoted_string_parser, text)
    text = re.sub(r"\'.*\'", quoted_string_parser, text)
    text = re.sub(r"\(.*\)", quoted_string_parser, text)
    text = re.sub(r"\[.*\]", quoted_string_parser, text)
    text = re.sub(r"\{.*\}", quoted_string_parser, text)
    text = re.sub(r"\<.*\>", quoted_string_parser, text)

    text = re.sub(r'[\(\)\[\]\{\}\<\>\'\"]', pad_pattern, text)

    # a single 's' at this stage is 99% certain to be noise, just kill it
    text = re.sub(' s ', " ", text)

    # reduce extra spaces to single spaces
    text = re.sub(r'[\s]+', " ", text)
    text = text.strip()

    return text
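# clean_string relies on two globals the snippet never defines: a loaded spaCy
# pipeline (`nlp`, with word vectors, so token.has_vector works) and a
# SPECIAL_TOKENS dict with 'quoted' and 'non-ascii' keys. The wiring below is
# an assumption for illustration; the model name and token spellings are
# guesses, not taken from the original code.
import spacy

nlp = spacy.load('en_core_web_md')  # any model shipping word vectors works
SPECIAL_TOKENS = {
    'quoted': 'quoted_item',
    'non-ascii': 'non_ascii_word',
}

# Example: clean_string('What\'s the best 3-year-old laptop? "asdkjhaskjdh"')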
import re


def clean_text(text):
    # Lowercase, then strip everything except letters, digits, underscores
    # and whitespace.
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9_\s]+', '', text)
    text = text.strip(' ')
    return text
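# A tiny smoke test of the cleaner above; the sample string is made up.
print(clean_text("Hello, World! #NLP-2024"))  # -> "hello world nlp2024"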
import numpy as np


def cloak_textfooler(text, classifier_func,
                     importance_func=importance_scores,
                     select_func=lambda x, y: select_non_stopword(x),
                     synonym_func=None,
                     candidate_word_filter=None,
                     candidate_sentence_filter=None,
                     sim_func=None,
                     sim_threshold=0.8):
    text = text.strip()
    # Note that we're going to be splitting and rejoining this a lot,
    # so all whitespace is treated as equivalent.

    # Determine importance scores for each word in the text.
    importance = importance_func(text, classifier_func)
    words = text.split()

    # Cull the word list down to attackable words.
    selected_words_indexes = list()
    selected_words_importance = list()
    for word_id, word in enumerate(words):
        if select_func(word, importance[word_id]):
            selected_words_indexes.append(word_id)
            selected_words_importance.append(importance[word_id])

    # Sort the word list by importance, most important first.
    sorted_word_indexes = [x for _, x in sorted(
        zip(selected_words_importance, selected_words_indexes), reverse=True)]

    # First, find our baseline prediction.
    current_text = ' '.join(words)
    orig_probs = classifier_func(current_text)
    orig_class = np.argmax(orig_probs)

    # MAIN LOOP:
    # For each word, in sorted order, identify synonyms, find the best
    # candidate, and replace the word with it. If we're able to change the
    # predicted class, return immediately. If not, keep our replacement
    # and keep going.
    for word_index in sorted_word_indexes:
        # Expand the word into a set of candidates.
        word = words[word_index]
        candidates = synonym_func(word)

        # If we were provided a candidate filter at the word level, apply it now.
        if candidate_word_filter:
            candidates = [c for c in candidates if candidate_word_filter(word, c)]

        # For each candidate, try replacing the word with it. Drop candidates
        # that fail the sentence filter or the similarity test.
        final_candidates = list()
        final_sentences = list()
        final_class = list()
        final_probs = list()
        for candidate in candidates:
            new_text = ' '.join(words[:word_index] + [candidate] + words[word_index + 1:])
            # If we were provided a sentence-level candidate filter, apply it now.
            if candidate_sentence_filter and not candidate_sentence_filter(text, new_text):
                continue
            # Assuming we passed the filter, check the similarity threshold.
            if sim_func(text, new_text) > sim_threshold:
                # If we pass, keep this candidate along with its class
                # probabilities and assignment.
                final_candidates.append(candidate)
                final_sentences.append(new_text)
                cand_probs = classifier_func(new_text)
                final_probs.append(cand_probs)
                final_class.append(np.argmax(cand_probs))

        # If no candidates remain, skip this word and keep looping.
        if len(final_candidates) == 0:
            continue

        # Find all candidate sentences that break the predicted class.
        # (The original collected bare words here and returned a single word;
        # tracking the full sentences fixes that.)
        class_breakers = [sent for sent, cls in zip(final_sentences, final_class)
                          if cls != orig_class]

        # If any candidates change the class, discard the rest, select the one
        # with the best similarity to the original, and return it.
        if len(class_breakers) > 0:
            sims = [sim_func(text, sent) for sent in class_breakers]
            return class_breakers[np.argmax(sims)]

        # Otherwise, select the remaining candidate that maximizes probability
        # damage to the original class.
        damage = [orig_probs[orig_class] - probs[orig_class] for probs in final_probs]

        # If no candidate improves our odds, leave the word as is.
        # Otherwise, commit the replacement into the working word list
        # (the original overwrote current_text with a bare word here)
        # and keep looping.
        if max(damage) > 0:
            words[word_index] = final_candidates[np.argmax(damage)]
            current_text = ' '.join(words)

    # If we get to the end of the loop without breaking, return what we've got
    # as a failure.
    print("Unable to find successful attack against sentence:")
    print(text)
    print("Best candidate:")
    print(current_text)
    return current_text
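# A hedged usage sketch for cloak_textfooler. Everything wired up below (the
# WordNet synonym lookup, the Jaccard similarity, and the classifier name) is
# an assumption for illustration; the original code expects the caller to
# supply these.
from nltk.corpus import wordnet


def wordnet_synonyms(word):
    # Gather distinct WordNet lemma names for the word, excluding the word
    # itself (requires the NLTK wordnet corpus to be downloaded).
    names = {lemma.name().replace('_', ' ')
             for synset in wordnet.synsets(word)
             for lemma in synset.lemmas()}
    return sorted(names - {word})


def jaccard_similarity(a, b):
    # Crude sentence similarity over word sets; a real attack would use a
    # sentence encoder instead.
    set_a, set_b = set(a.split()), set(b.split())
    return len(set_a & set_b) / len(set_a | set_b)


# adversarial = cloak_textfooler(
#     "the movie was wonderful", my_classifier,  # my_classifier: text -> probability vector
#     synonym_func=wordnet_synonyms,
#     sim_func=jaccard_similarity)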
import re
import string

from bs4 import BeautifulSoup


def clean_html_and_extract_text(raw_html):
    '''
    Clean an html string that comes from the "cleaned_value" column.
    '''
    ## use regular expressions to remove roman numerals inside brackets,
    ## e.g. (iv), (ix) etc.
    raw_html = re.sub(r'\([vix]+\)', '', raw_html)

    ## clear off the non-ascii characters, remove the html tags
    ## and get just the text from the document
    raw_html = bytes(raw_html, 'utf-16').decode('utf-16', 'ignore')
    soup = BeautifulSoup(raw_html, 'lxml')
    for e in soup.findAll('br'):
        e.replace_with(" ")
    cleantext = soup.getText(separator=u' ')
    cleantext = " ".join(cleantext.split())
    cleantext = ''.join(x for x in cleantext if x in string.printable)

    ## replace punctuation with spaces rather than deleting it
    ## Ref: https://stackoverflow.com/questions/42614458/how-to-replace-punctuation-with-whitespace
    punc_list = list(string.punctuation)
    translator = cleantext.maketrans(dict.fromkeys(punc_list, " "))
    cleantext = cleantext.lower().translate(translator)

    ## normalize spaces between words and lowercase the text
    ## (the original used `is not ""`, an identity check, instead of `!=`)
    cleantext = " ".join(
        text for text in cleantext.split(" ") if text.strip() != "").lower()

    ## remove any non-printable (non-ascii) characters left in the text
    printable = set(string.printable)
    cleantext = "".join(filter(lambda x: x in printable, cleantext))

    ## remove roman numerals from the string which are not in brackets
    ## (the original split on the literal string "\s+" and padded the list
    ## entries with spaces, so nothing ever matched)
    toremove = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
                '!', '@', '#', '$', '%', '^', '&', '*', '$.']
    text_array = re.split(r'\s+', cleantext)
    cleantext = " ".join(word.strip() for word in text_array if word not in toremove)

    ## clear off all arabic numerals / digits in the text, including ones
    ## attached to words
    numbers = re.findall(r'\d+', cleantext)
    while len(numbers) != 0:
        for number in numbers:
            cleantext = cleantext.replace(number, " ")
        numbers = re.findall(r'\d+', cleantext)

    cleantext = re.sub(' +', ' ', cleantext)
    return cleantext.strip()
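# A small illustrative call; the HTML fragment is made up.
sample = "<p>Section (iv): Revenue grew 12% in 2019.</p>"
print(clean_html_and_extract_text(sample))  # roughly -> "section revenue grew in"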
# Script-level fragment mirroring get_hashtags_and_user_mentions above. It
# assumes `text`, `character`, `count_character`, `hashtags` and
# `user_mentions` are defined earlier in the script; the `while` header is
# inferred from the trailing recount of `character`, and the debug prints are
# kept as in the original fragment.
import codecs
import os

import food_detection_root

while count_character > 0:
    print(character)
    start = text.find(character)
    end = text.find(" ", start)
    if end == -1:
        end = len(text)
    text_to_remove = text[start:end]
    print(text.count(text_to_remove))
    print(text_to_remove)
    if character == "#":
        hashtags.append(text_to_remove)
    elif character == "@":
        user_mentions.append(text_to_remove)
    text = text.replace(text_to_remove, "")
    text = ' '.join(text.split())
    count_character = text.count(character)

text = text.strip(' ')
text = ' '.join(text.split())
print(text)
print(user_mentions)
print(hashtags)

# Load the "what food" word list shipped with the project and keep the
# hashtags that contain any of those words.
path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt", encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
hashtags_with_what_words = []
for hashtag in hashtags:
    for word in what_food_list:
        if word in hashtag:
            print(word)
            hashtags_with_what_words.append(hashtag)