import string

import contractions as cont

# Assumption: `translator` is a punctuation-stripping table (its definition is not shown in the original snippet).
translator = str.maketrans('', '', string.punctuation)


def Text_Cleanup(f):
    text = f.lower()
    text_contraction = cont.fix(text)
    text_punc = text_contraction.translate(translator)
    text_clean = ' '.join(word for word in text_punc.split() if len(word) > 1)
    text_sent = ' '.join(i for i in text_clean.split() if i.isalnum() and not i.isdigit())
    return text_sent
def replaceContractions(self, text):
    """Replace contractions in string of text"""
    return contractions.fix(text)
    # tail of the smiley() helper that maps emoticons to sentiment words
    x23 = x22.replace(":-/", "sad")
    x24 = x23.replace(":/", "sad")
    x25 = x24.replace(":|", "sad")
    return x25


df['emoticons_replacment'] = df['textOriginal'].apply(smiley)

# -----------------------------------------------------------------------------------------------------------------------
df["less_spaces"] = df['emoticons_replacment'].apply(lambda x: re.sub(' +', ' ', x))

# https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28
df['text_expan_contractions'] = df['less_spaces'].apply(
    lambda x: [contractions.fix(word) for word in x.split()])
df['text_expan_contractions'] = [
    ' '.join(map(str, l)) for l in df['text_expan_contractions']
]

# removes non-alphanumeric / non-whitespace characters from strings
df['text_misc_char_removed'] = df['text_expan_contractions'].str.replace(
    '\u2019', '', regex=False)  # just a lil something to replace the weird apostrophe (U+2019)
df['text_misc_char_removed'] = df['text_misc_char_removed'].map(
    lambda x: re.sub(r"[^0-9a-zA-Z\s]+", '', x)
)  # this includes punctuation, which shows little value in analysis


# removes emojis
def deEmojify(text):
def con(text):
    expand = contractions.fix(text)
    return expand
def expand_contractions(text):
    """Expand shortened words, e.g. don't to do not"""
    return contractions.fix(text)
def replace_contractions(df):
    # return contractions.fix(t)
    # df['text_prep'] = df.text_prep.apply(lambda x: nltk.word_tokenize(contractions.fix(TreebankWordDetokenizer().detokenize(x))))
    df['text_prep'] = df.text_prep.apply(lambda x: contractions.fix(x))
    print('contractions expansion done')
    return df
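# Hedged usage sketch, added for illustration only (not from the original source):
# replace_contractions above expects a DataFrame with a 'text_prep' string column.
# The sample data here is made up.
import pandas as pd

sample = pd.DataFrame({'text_prep': ["I can't go", "they're late"]})
sample = replace_contractions(sample)
print(sample['text_prep'].tolist())  # contractions expanded, e.g. "they are late"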
def convert_emoticons(text):
    for emot, desc in EMOTICONS.items():
        # re.escape keeps emoticon characters like ')' from being read as regex syntax
        text = re.sub(u'(' + re.escape(emot) + ')', desc, text)
    return text


udf_convert_emoticons = udf(convert_emoticons)


def convert_contractions(text):
    return contractions.fix(text)


# udf_convert_contractions = udf(convert_contractions)
udf_convert_contractions = udf(lambda text: contractions.fix(text))


def convert_numbers_to_text(text):
    return ' '.join([num2words(w) if w.isdigit() else w for w in text.split()])


# udf_convert_numbers_to_text = udf(convert_numbers_to_text, ArrayType(StringType()))
# The lambda returns a single joined string, so the UDF return type is StringType, not ArrayType.
udf_convert_numbers_to_text = udf(
    lambda text: ' '.join(
        [num2words(w) if w.isdigit() else w for w in text.split()]),
    StringType())


def convert_date_to_text(text):
    result = []
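# Hedged usage sketch, added for illustration only: one way to wire the UDFs above
# onto a Spark DataFrame. The session setup, the DataFrame name `df_sample`, and the
# column name `text` are assumptions, not part of the original source.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('text-prep-demo').getOrCreate()
df_sample = spark.createDataFrame([("I can't wait",), ("we have 2 cats",)], ['text'])
df_sample = (df_sample
             .withColumn('no_contractions', udf_convert_contractions(col('text')))
             .withColumn('numbers_as_words', udf_convert_numbers_to_text(col('no_contractions'))))
df_sample.show(truncate=False)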
def test_add():
    contractions.add('mychange', 'my change')
    assert contractions.fix('mychange') == 'my change'
def data_cleaning(df):
    info = [df['Sentiment'].values.tolist(), df['Text'].values.tolist()]
    df_data = list(zip(*info))
    clean_data = []
    sentiment = []
    pos = {}
    neg = {}
    counts = {}
    for i in range(0, len(df_data)):
        if int(df_data[i][0]) == 1:
            sentiment.append(1)
        if int(df_data[i][0]) == -1:
            sentiment.append(-1)
        # 1) remove emails
        clean_sentence = re.sub(r'\s*\S*(@)\S*', '', str(df_data[i][1]))
        # remove mentions
        clean_sentence = re.sub(r"([@][\w_-]+)", "", clean_sentence)
        # 2) remove 10 digit phone numbers
        # clean_sentence = re.sub(r'\d{10}', '', clean_sentence)
        # 3) remove $n
        clean_sentence = re.sub(r'\$[^ ]+', '', clean_sentence)
        # 4) remove times & dates: 2/24 2:10pm, 6/30, 7:00 AM
        clean_sentence = re.sub(
            r'[0-9]*[:/][0-9]*\S*\s[A][M]|[0-9]*[:/][0-9]*\S*\s[P][M]|[0-9]*[:/][0-9]*\S*',
            '', clean_sentence)
        # 5) convert emojis
        clean_sentence = emoji.demojize(clean_sentence, delimiters=(' ', ' '))
        # 6) fix contractions
        clean_sentence = contractions.fix(clean_sentence)
        # 7) remove links
        clean_sentence = re.sub(r'\s*\S*(http)\S*', '', clean_sentence)
        # 8) keep the hashtag text but remove the sign
        clean_sentence = clean_sentence.replace("#", "")
        # 9) all to lower case for easy tokenization and fewer features
        # clean_sentence = clean_sentence.lower()
        # 10) remove < >
        clean_sentence = clean_sentence.replace("<", "")
        clean_sentence = clean_sentence.replace(">", "")
        # 11) remove punctuation
        clean_sentence = re.sub(r'[^A-Za-z0-9]+', ' ', clean_sentence)
        # 12) lemmatize verbs
        tokenized = word_tokenize(clean_sentence)
        lemmatizer = WordNetLemmatizer()
        clean_tokens = []
        for word in tokenized:
            cur = lemmatizer.lemmatize(word, pos='v')
            clean_tokens.append(cur)
        # 13) remove stop words
        stop_words = set(stopwords.words('english'))
        filtered_sentence = [w for w in clean_tokens if w not in stop_words]
        clean_data.append(filtered_sentence)
        for word in filtered_sentence:
            if sentiment[i] == 1:
                if word in pos and not word.isupper():
                    pos[word] += 1
                else:
                    pos[word] = 1
            else:
                if word in neg and not word.isupper():
                    neg[word] += 1
                else:
                    neg[word] = 1
            if word not in counts:
                counts[word] = 1
            else:
                counts[word] += 1
    # drop words that only appear once
    neg = {key: val for key, val in neg.items() if val != 1}
    pos = {key: val for key, val in pos.items() if val != 1}
    alldata = {key: val for key, val in counts.items() if val != 1}
    # print(clean_data[5])
    # print(len(sentiment) == len(clean_data))
    f = open("pos.txt", "w")
    f.write(str(pos))
    f.close()
    f = open("neg.txt", "w")
    f.write(str(neg))
    f.close()
    f = open("counts.txt", "w")
    f.write(str(alldata))
    f.close()
    a_dictionary = dict(Counter(counts).most_common(50))
    keys = a_dictionary.keys()
    values = a_dictionary.values()
    plt.xticks(fontsize=6)
    plt.bar(keys, values, color='pink')
    plt.show()
    # check for duplicates
    new_clean_data = []
    for i in range(0, len(clean_data)):
        cur_data = (' '.join(clean_data[i]), sentiment[i])
        if cur_data not in new_clean_data:
            # new_clean_data.append(clean_data[i])
            # new_senti.append(sentiment[i])
            new_clean_data.append(cur_data)
    # print(new_clean_data[:3])
    return new_clean_data
def contraction_expansion(text):
    text = contractions.fix(text)
    return text
def remove_contractions(x):
    rem_cont = [contractions.fix(word) for word in x.split()]
    return " ".join(map(str, rem_cont))
def standardised_query(pl, text):
    text = remove_punctuation(text)
    text = contractions.fix(text)
    text = lemmatise(text)
    return escape_and_call_prolexa(pl, text)
def get_token_words(survey_data, col_name, stopwords_list, title):
    """
    Return a list of all the token words that we can use to generate the bigrams.

    survey_data: pandas DataFrame containing the data to analyze
    col_name: name of the specific column whose responses are analyzed
    stopwords_list: list of words to ignore that might be in the question
    title: title used for the sentiment CSV output
    """
    # Drop null values, reset the index
    print(survey_data.columns)
    data = survey_data.dropna(subset=[col_name])
    data = data.reset_index(drop=True)
    # Get only the column that you need
    responses_data = data[col_name]
    # Make it a list without the column name
    responses = []
    for i in range(len(responses_data) - 1):
        responses.append(str(responses_data[i + 1]))
    sentiment_list = sentiment(responses)
    sent = []
    for i in range(len(sentiment_list)):
        if sentiment_list[i] != "n/a":
            if sentiment_list[i] > 0:
                sent.append("pos")
            elif sentiment_list[i] == 0:
                sent.append("neutral")
            else:
                sent.append("neg")
        else:
            sent.append("n/a")
    sentiment_table = pd.DataFrame({'sentiment': sentiment_list})
    sentiment_table['sent_word'] = sent
    sentiment_table['responses'] = responses
    sentiment_table.to_csv("files/Neut_" + title, index=False)
    processed = []
    index = 0
    while index < len(responses):
        # Look at words in one response
        # lowercase
        responsewords = responses[index].lower()
        # remove punctuation
        responsewords = re.sub('[!#?,.:";\']', "", responsewords)
        # split into a list
        resultwords = responsewords.split()
        # expand contractions
        for i in range(len(resultwords)):
            word = resultwords[i]
            resultwords[i] = contractions.fix(word)
        # remove stopwords
        for word in stopwords_list:
            if word in resultwords:
                resultwords.remove(word)
        # Join back as text
        processed.append(" ".join(resultwords))
        index += 1
    # Join the responses into one big text
    text = " ".join(processed)
    # Stemming removes suffixes (ing, ly, s); lemmatization keeps the root word
    st = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    text_stem = []
    for word in text.split(" "):
        word = st.stem(word)
        text_stem.append(lemmatizer.lemmatize(word))
    text = " ".join(text_stem)
    # Pass the string into word_tokenize to break it into tokens
    tokens = word_tokenize(text)
    # Word cloud
    # wordcloud = WordCloud(background_color="white").generate(text)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.show()
    # Remove English stopwords
    eng_stopwords = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in eng_stopwords]
    return tokens
def replace_contractions(text):
    """Replaces contractions (it's -> it is)"""
    return contractions.fix(text)
def fix_contractions(s):
    s = contractions.fix(s)
    return s
wordset = set()   # initialise empty set of words
bodies = list()   # initialise empty list to contain the bodies of text gathered

# `site`, `tags`, and the imports (urllib, bs4's bs, nltk, re, contractions) are
# assumed to come from the earlier scraping setup.
with open('text.txt', 'w') as f:
    for tag_idx in range(1, 174):
        print(tags[tag_idx])
        href = tags[tag_idx].attrs['href']
        page = urllib.request.urlopen(site + href)
        page_soup = bs(page, 'html.parser')
        # print(page_soup.prettify())
        text = page_soup.find('font', face='verdana')
        # print(str(text))
        print('\n\n\n')
        text = text.text  # get just the text (remove tags etc.)
        f.write(text)
        text = re.sub(r'\[[^]]*\]', '', text)  # remove square brackets
        text = contractions.fix(text)  # replace contractions with their full words
        words = nltk.word_tokenize(text)  # make list of word tokens
        words = [word.lower() for word in words]  # lowercase
        words = [re.sub(r'[^\w\s]', '', word) for word in words]  # replace punctuation with empty string
        words = [word for word in words if word != '']  # remove empty strings
        bodies.append(words)
        print(bodies)
        wordset = wordset.union(words)  # add any new words in this body to the set of words
        break

print('Number of individual words:', len(wordset))
# print(text)
print(words)
def test_fix():
    assert contractions.fix("you're happy now") == "you are happy now"
def replace_contractions(text, verbose=False):
    """Replace contractions in string of text"""
    new_text = contractions.fix(text)
    if verbose:
        print(new_text)
    return new_text
def remove_contractions(text):
    return contractions.fix(text)
def replace_contractions(self, text):
    return contractions.fix(text)
def convert_contractions(text):
    return contractions.fix(text)
def preprocess(temp):
    # get_username and english_stopwords are assumed to be defined elsewhere in the module
    # remove URLs, then expand contractions
    temp = re.sub(r"(http|https)\S+", "", temp)
    temp = contractions.fix(temp)
    # tokenize
    tokens = nltk.word_tokenize(temp)
    # tokens = tokenizer.tokenize(temp)
    # for i, token in enumerate(tokens):
    #     if token[0].isupper():
    #         print("{}:{}:{}".format(i, file, token))
    string.punctuation = string.punctuation + "''``--"
    # print(tokens)
    new_tokens = []
    pattern = r'\d+(\.\d+)?'
    for i, token in enumerate(tokens):
        if i == len(tokens):  # note: enumerate never yields len(tokens), so this never triggers
            break
        if token == '@':
            # replace twitter handle with screen name
            temp = tokens[i] + tokens[i + 1]
            # print(temp)
            if get_username(temp) is not None:
                # print(get_username(temp))
                temp_list = nltk.word_tokenize(get_username(temp))
                for t in temp_list:
                    new_tokens.append(t.lower())
                i = i + 2  # note: reassigning i inside a for loop does not skip the next token
                continue
        if len(token) < 3:
            continue
        if token in english_stopwords:
            continue
        if token not in string.punctuation:
            if not re.match(pattern, token):
                # strip embedded punctuation characters
                for s in token:
                    if s in string.punctuation:
                        token = token.replace(s, '')
            else:
                if token.isdigit():
                    token = num2words(float(token))
            new_tokens.append(token.lower().encode("ascii", errors="ignore").decode())
    '''
    # lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    final_tokens = []
    for token in new_tokens:
        final_tokens.append(lemmatizer.lemmatize(token))

    # stemmer
    stemmer = nltk.PorterStemmer()
    final_tokens = []
    for token in new_tokens:
        final_tokens.append(stemmer.stem(token))
    '''
    return new_tokens
start_time = time.time()
vocab_full = {}
n_doc = 0

# Only keep the data directories and ignore possible system files like .DS_Store
folders = [
    os.path.join(root_path, name) for name in os.listdir(root_path)
    if os.path.isdir(os.path.join(root_path, name))
]

for folder in folders:
    for filename in os.listdir(folder):
        file = os.path.join(folder, filename)
        n_doc += 1
        with open(file, 'r', encoding='utf8', errors='ignore') as f:
            for line in f:
                # split contractions into two words
                line = contractions.fix(line)
                tokens = word_tokenize(line)
                # force everything to lower case and remove non-alphabetic characters
                tokens = [token.lower() for token in tokens if token.isalpha()]
                for token in tokens:
                    # remove stop words, other words (above) and single characters
                    if (token not in stop_words) and (token not in other_words) and (len(token) > 1):
                        vocab_full[token] = vocab_full.get(token, 0) + 1

print(f'{n_doc} documents in total with a total vocab size of {len(vocab_full)}')

vocab_sorted = sorted(vocab_full.items(), key=operator.itemgetter(1), reverse=True)
vocab_truncated = vocab_sorted[:MAX_VOCAB_SIZE]

# Save the vocabulary to file for visual inspection and possible analysis
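# Hedged sketch of the truncated save step above, for illustration only; the output
# file name 'vocab.txt' is an assumption (the original name is not shown).
with open('vocab.txt', 'w', encoding='utf8') as out:
    for word, count in vocab_truncated:
        out.write(f'{word}\t{count}\n')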
def expand_contractions(text):
    text_uncont = contractions.fix(text)
    return text_uncont
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = contractions.fix(text)
    return text
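# Hedged usage sketch, for illustration only: tags are stripped first, then
# contractions are expanded ("you're" -> "you are", as in the test_fix example above).
print(remove_html_tags("<p>you're <b>late</b></p>"))  # -> "you are late"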
def expand_contraction(input_text: str) -> str:
    """Expand contractions in input text"""
    return contractions.fix(input_text)
def replace_contractions(txt):
    return contractions.fix(txt)
def replace_contractions(text):
    return contractions.fix(text)
def remove_contractions(k):
    # don't -> do not
    return k.apply(lambda x: contractions.fix(x))
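# Hedged usage sketch, for illustration only: the function above expects a pandas
# Series (it calls .apply on its argument); the sample data is made up.
import pandas as pd

reviews = pd.Series(["don't stop", "it's fine"])
print(remove_contractions(reviews).tolist())  # ['do not stop', 'it is fine']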
def expand_contractions(text):
    return contractions.fix(text)
def _replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)