import re
import string


def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove
    punctuation and remove words containing numbers.'''
    text = text.lower()
    text = text.replace('-', ' ')
    text = text.replace("’s ", " ").replace("' ", " ")
    text = text.replace("s’ ", " ").replace("s' ", " ")  # remove proper possessives
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
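# Hypothetical usage sketch (the input string below is illustrative, not from
# the original source): bracketed spans, punctuation, and digit-bearing
# tokens are stripped, and the text is lowercased.
#
#   clean_text_round1("The 2nd Review [sic] was GREAT!")
#   # -> 'the  review  was great'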
def removestop(text):
    # keep letters, digits, and a small set of symbols; the '-' is placed
    # last in the class so it is a literal hyphen rather than a range
    # (the original '+-=' spanned '+' through '=', silently admitting ':;<')
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+=-]", " ", text)
    text = text.lower().split()
    stops = ['so', 'his', 't', 'y', 'ours', 'herself', 'your', 'all', 'some',
             'they', 'i', 'of', 'didn', 'them', 'when', 'will', 'that', 'its',
             'because', 'while', 'those', 'my', 'don', 'again', 'her', 'if',
             'further', 'now', 'does', 'against', 'won', 'same', 'a', 'during',
             'who', 'here', 'have', 'in', 'being', 'it', 'other', 'once',
             'itself', 'hers', 'after', 're', 'just', 'their', 'himself',
             'theirs', 'whom', 'then', 'd', 'out', 'm', 'mustn', 'where',
             'below', 'about', 'isn', 'shouldn', 'wouldn', 'these', 'me', 'to',
             'doesn', 'into', 'the', 'until', 'she', 'am', 'under', 'how',
             'yourself', 'couldn', 'ma', 'up', 'than', 'from', 'themselves',
             'yourselves', 'off', 'above', 'yours', 'having', 'mightn',
             'needn', 'on', 'too', 'there', 'an', 'and', 'down', 'ourselves',
             'each', 'hadn', 'ain', 'such', 've', 'did', 'be', 'or', 'aren',
             'he', 'should', 'for', 'both', 'doing', 'this', 'through', 'do',
             'had', 'own', 'but', 'were', 'over', 'not', 'are', 'few', 'by',
             'been', 'most', 'no', 'as', 'was', 'what', 's', 'is', 'you',
             'shan', 'between', 'wasn', 'has', 'more', 'him', 'nor', 'can',
             'why', 'any', 'at', 'myself', 'very', 'with', 'we', 'which',
             'hasn', 'weren', 'haven', 'our', 'll', 'only', 'o', 'before']
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    text = text.replace(".", " ").replace(",", " ")
    return text
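# Hypothetical usage sketch (illustrative input): NLTK-style English
# stopwords and contraction fragments ('t', 've', 'll', ...) are dropped.
#
#   removestop("this is what they said to me")
#   # -> 'said'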
def named_entities(text):
    ''' Replaces all named entities before vectorization. '''
    # `entities` is expected to be a module-level dict mapping surface
    # forms to their canonical replacements
    for k, v in entities.items():
        text = text.replace(k, v)
    return text
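# Hypothetical usage sketch: `entities` is not defined in this file, so a
# minimal stand-in is shown (names and mapping are illustrative only).
#
#   entities = {"New York City": "new_york_city", "NYC": "new_york_city"}
#   named_entities("I moved to NYC last year")
#   # -> 'I moved to new_york_city last year'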
def remove_special_characters(text):
    '''
    Doesn't remove digits
    Input:  "Well this was fun! What do you think?\n 123#@!__ 123_"
    Output: "Well this was fun What do you think\n 123 123"
    '''
    text = text.replace('&', 'and')
    # equivalent to the original r'[^a-zA-z0-9\s]|[_\^\\\`\[\]]': the 'A-z'
    # typo admitted [ \ ] ^ _ ` into the class, which the second alternation
    # then removed again; 'A-Z' does it in one pass ([\s] == [ \t\n\r\f\v])
    pattern = r'[^a-zA-Z0-9\s]'  # use r'[^a-zA-Z\s]' to drop digits as well
    text = re.sub(pattern, '', text)
    return text
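# A second illustrative call, showing the '&' expansion alongside the
# punctuation stripping from the docstring example:
#
#   remove_special_characters("AT&T rocks!")
#   # -> 'ATandT rocks'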
def corpus_specific_text_cleaning(text):
    """
    For performing corpus-specific cleaning. Added to this file, since it
    needs to be adapted to the corpus and is therefore a kind of
    configuration.
    """
    # strip JSON field residue and escape sequences, expand contractions
    text = text.replace('"full_text" : ', "").strip().replace('"', '').replace(
        '\\n*', ' ').replace('\\', ' ').replace('&', ' ').replace("'ve", ' have')
    text = text.replace("don't", 'do not').replace("doesn't", 'does not').replace(
        "Don't", 'Do not').replace("Doesn't", 'Does not')
    # drop corpus-specific placeholder tokens
    text = text.replace("_NEWLINE_", " ").replace(
        "_CITATION_PREVIOUS_POST_PARAGRAPH", " ").replace(
        "_CITATION_PREVIOUS_POST_", " ").replace("_POSTER_", " ")
    # drop tokens that look like URLs or handles
    no_links = []
    for word in text.split(" "):
        if "//" not in word and "http" not in word and "@" not in word:
            no_links.append(word)
    cleaned_text = " ".join(no_links)
    return cleaned_text
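# Hypothetical usage sketch (illustrative raw line from a JSON-ish dump):
#
#   corpus_specific_text_cleaning('"full_text" : "I\'ve read it at http://x.y"')
#   # -> 'I have read it at'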
def remove_extra_meaningless_newlines(text):
    '''
    Input:  "word1\r\n word2\t\r\n \r\n \r\n word3 \n \r \n \n \n mst"
    Output: "word1\nword2\t\nword3 \nmst"
    --> removes leading whitespace on each new line and collapses extra \n
    '''
    text_cln = text.replace('\r', '\n')
    # repeatedly collapse whitespace-only lines until the text is stable
    while True:
        len_ini = len(text_cln)
        text_cln = re.sub('\\n +\\n', '\\n', text_cln)
        if len(text_cln) == len_ini:
            break
    text_cln = '\n'.join([ele for ele in text_cln.split('\n') if len(ele) > 0])
    text_cln = re.sub('\\n *', '\\n', text_cln)
    return text_cln
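# Hypothetical usage sketch, mirroring the docstring example on a small input:
#
#   remove_extra_meaningless_newlines("a\r\n  \r\n b")
#   # -> 'a\nb'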
def _remove_personality_types(text: str) -> str:
    """Removes all mentions of personality types from text to avoid overfitting."""
    # `config["types"]` is expected to hold the lowercase type codes
    for ptype in config["types"]:
        if ptype in text or ptype.upper() in text:
            # replace the longer suffixed forms first, so the bare form
            # doesn't eat them and leave a stray "s" or "'s" behind
            text = text.replace(ptype + "'s", "")
            text = text.replace(ptype + "s", "")
            text = text.replace(ptype, "")
            text = text.replace(ptype.upper() + "'s", "")
            text = text.replace(ptype.upper() + "s", "")
            text = text.replace(ptype.upper(), "")
    return text
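# Hypothetical usage sketch (`config` is not defined in this file; the
# MBTI-style codes below are illustrative only):
#
#   config = {"types": ["intj", "enfp"]}
#   _remove_personality_types("Typical INTJs, said the ENFP.")
#   # -> 'Typical , said the .'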
from functools import reduce


def clean_text(text, country):
    # `contractions`, `strip_accents`, `tk`, and the per-country stopword
    # lists are expected to be defined at module level
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(),
                  text.lower())
    text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = strip_accents(text)
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = tk.tokenize(text)
    if country == 'USA':
        stopwords = usa_stopwords
    elif country == 'Canada':
        stopwords = canada_stopwords
    elif country == 'UK':
        stopwords = britain_stopwords
    else:
        raise ValueError("Country is invalid.")
    tokens = [
        w for w in tokens
        if w not in stopwords and len(w) > 2 and w != ' ' and not w.isdigit()
    ]
    return ' '.join(tokens)
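# Hypothetical usage sketch: every name below is a stand-in for the
# module-level objects this function expects, not the originals.
#
#   from nltk.tokenize import TweetTokenizer
#   tk = TweetTokenizer()
#   contractions = {"can't": "cannot"}
#   usa_stopwords = {"the", "and"}
#   strip_accents = lambda s: s  # real version would fold é -> e
#   clean_text("We can't wait!", 'USA')
#   # -> 'cannot wait'   ('we' is dropped by the len(w) > 2 filter)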
def get_hashtags_and_user_mentions(special_characters, text,
                                   wanted_characters=['#', '@']):
    # Identify hashtags and user mentions, and remove URLs
    results = {}
    for character in special_characters:
        # collapse runs of the marker and force it to start a new token
        text = re.sub('(' + character + ')+', ' ' + character, text)
        count_character = text.count(character)
        while count_character > 0:
            start = text.find(character)
            # the token ends at the nearest space or newline that actually
            # occurs; if neither does, it runs to the end of the text
            # (the original compared raw find() results, so a missing space
            # made the token swallow the rest of the string past a newline)
            candidates = [i for i in (text.find(" ", start),
                                      text.find("\n", start)) if i != -1]
            end = min(candidates) if candidates else len(text)
            text_to_remove = text[start:end]
            if len(text_to_remove) > 2 and character in wanted_characters:
                results.setdefault(character, []).append(text_to_remove)
            text = text.replace(text_to_remove, "")
            text = ' '.join(text.split())
            count_character = text.count(character)
    for wanted_character in wanted_characters:
        if wanted_character not in results:
            results[wanted_character] = []
    text = text.strip(' ')
    text = ' '.join(text.split())
    results['clean_text'] = text
    return results
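# Hypothetical usage sketch (illustrative tweet-like input):
#
#   get_hashtags_and_user_mentions(['#', '@'], "lunch #pizza with @anna")
#   # -> {'#': ['#pizza'], '@': ['@anna'], 'clean_text': 'lunch with'}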
def corpus_specific_text_cleaning(text):
    # normalize spacing around commas and parentheses
    text = text.replace(" ,", ",")\
               .replace("( ", " (")\
               .replace(" )", ") ")
    return text
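# Hypothetical usage sketch (illustrative detokenized input):
#
#   corpus_specific_text_cleaning("so ( yes , really )")
#   # -> 'so  (yes, really) '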
    dictionary, sort_topics=False, n_jobs=-1)  # , R=15)
pyLDAvis.save_html(lda_display, displayname)
webbrowser.open(displayname, new=2)

# save backup
pickle.dump(ldamodel, open('ldamodel.pkl', 'wb'))

# save topics and terms
topics = pd.DataFrame(
    ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=30)).drop(columns=0)
for i in range(len(topics)):
    text = topics.iat[i, 0]
    topics.iat[i, 0] = text.replace('*', ' ')  # clean for easier reading
topics.to_csv('gensim_topics.csv', index=False, header=False)

"""apply LDA model to each consolidated document for each category"""
# merge data
combined_text = pd.DataFrame(raw_data['Cat'].unique().tolist())
corpusdf = combined_text.copy(deep=True)
for i in range(0, len(combined_text)):
    Cat = combined_text.iat[i, 0]
    corpusdf.iat[i, 0] = raw_data[raw_data['Cat'].str.match(Cat)]['1'].str.cat(
        sep=' ')

# clean data
def replace_words(text, dicty):
    # apply a plain string replacement for every (old, new) pair
    for i, j in dicty.items():
        text = text.replace(i, j)
    return text
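# Hypothetical usage sketch (replacements apply in dict insertion order):
#
#   replace_words("u r great", {"u ": "you ", " r ": " are "})
#   # -> 'you are great'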
if count_character > 0:
    while count_character > 0:
        start = text.find(character)
        end = text.find(" ", start)
        if end == -1:
            end = len(text)
        text_to_remove = text[start:end]
        if character == "#":
            hashtags.append(text_to_remove)
        elif character == "@":
            user_mentions.append(text_to_remove)
        text = text.replace(text_to_remove, "")
        text = ' '.join(text.split())
        count_character = text.count(character)
text = text.strip(' ')
text = ' '.join(text.split())
print(text)
print(user_mentions)
print(hashtags)

# load the reference list of food-related words
path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt",
                                  encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
hashtags_with_what_words = []
for hashtag in hashtags:
    for word in what_food_list:
def remove_punct(text):
    # `regex` is expected to be a precompiled punctuation pattern
    text = text.replace("/", " or ")
    text = regex.sub('', text)
    return text
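# Hypothetical usage sketch; the precompiled `regex` is not defined in this
# file, so a plausible stand-in is shown (an assumption, not the original):
#
#   regex = re.compile(r"[^\w\s]")
#   remove_punct("yes/no?")
#   # -> 'yes or no'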